diff --git a/.github/workflows/codesee-arch-diagram.yml b/.github/workflows/codesee-arch-diagram.yml
new file mode 100644
index 000000000..806d41d12
--- /dev/null
+++ b/.github/workflows/codesee-arch-diagram.yml
@@ -0,0 +1,23 @@
+# This workflow was added by CodeSee. Learn more at https://codesee.io/
+# This is v2.0 of this workflow file
+on:
+ push:
+ branches:
+ - main
+ pull_request_target:
+ types: [opened, synchronize, reopened]
+
+name: CodeSee
+
+permissions: read-all
+
+jobs:
+ codesee:
+ runs-on: ubuntu-latest
+ continue-on-error: true
+ name: Analyze the repo with CodeSee
+ steps:
+ - uses: Codesee-io/codesee-action@v2
+ with:
+ codesee-token: ${{ secrets.CODESEE_ARCH_DIAG_API_TOKEN }}
+ codesee-url: https://app.codesee.io
diff --git a/api/api/version_endpoints_api.go b/api/api/version_endpoints_api.go
index 531848794..98022671b 100644
--- a/api/api/version_endpoints_api.go
+++ b/api/api/version_endpoints_api.go
@@ -392,9 +392,13 @@ func validateUpdateRequest(prev *models.VersionEndpoint, new *models.VersionEndp
return fmt.Errorf("Updating environment is not allowed, previous: %s, new: %s", prev.EnvironmentName, new.EnvironmentName)
}
+ if prev.Status == models.EndpointPending {
+ return fmt.Errorf("Updating endpoint status to %s is not allowed when the endpoint is currently in the pending state", new.Status)
+ }
+
if new.Status != prev.Status {
if prev.Status == models.EndpointServing {
- return fmt.Errorf("Updating endpoint status to %s is not allowed when the endpoint is in serving state", new.Status)
+ return fmt.Errorf("Updating endpoint status to %s is not allowed when the endpoint is currently in the serving state", new.Status)
}
if new.Status != models.EndpointRunning && new.Status != models.EndpointTerminated {
diff --git a/api/api/version_endpoints_api_test.go b/api/api/version_endpoints_api_test.go
index 01088c9bf..e61dffade 100644
--- a/api/api/version_endpoints_api_test.go
+++ b/api/api/version_endpoints_api_test.go
@@ -3394,7 +3394,7 @@ func TestUpdateEndpoint(t *testing.T) {
ID: uuid,
VersionID: models.ID(1),
VersionModelID: models.ID(1),
- Status: models.EndpointPending,
+ Status: models.EndpointRunning,
ServiceName: "sample",
InferenceServiceName: "sample",
Namespace: "sample",
@@ -3648,7 +3648,120 @@ func TestUpdateEndpoint(t *testing.T) {
},
expected: &Response{
code: http.StatusBadRequest,
- data: Error{Message: "Error validating request: Updating endpoint status to running is not allowed when the endpoint is in serving state"},
+ data: Error{Message: "Error validating request: Updating endpoint status to running is not allowed when the endpoint is currently in the serving state"},
+ },
+ },
+ {
+ desc: "Should 400 if endpoint status is in the pending state",
+ vars: map[string]string{
+ "model_id": "1",
+ "version_id": "1",
+ "endpoint_id": uuid.String(),
+ },
+ requestBody: &models.VersionEndpoint{
+ ID: uuid,
+ VersionID: models.ID(1),
+ VersionModelID: models.ID(1),
+ Status: models.EndpointRunning,
+ ServiceName: "sample",
+ Namespace: "sample",
+ EnvironmentName: "dev",
+ Message: "",
+ ResourceRequest: &models.ResourceRequest{
+ MinReplica: 1,
+ MaxReplica: 4,
+ CPURequest: resource.MustParse("1"),
+ MemoryRequest: resource.MustParse("1Gi"),
+ },
+ EnvVars: models.EnvVars([]models.EnvVar{
+ {
+ Name: "WORKER",
+ Value: "1",
+ },
+ }),
+ },
+ modelService: func() *mocks.ModelsService {
+ svc := &mocks.ModelsService{}
+ svc.On("FindByID", context.Background(), models.ID(1)).Return(&models.Model{
+ ID: models.ID(1),
+ Name: "model-1",
+ ProjectID: models.ID(1),
+ Project: mlp.Project{},
+ ExperimentID: 1,
+ Type: "pyfunc",
+ MlflowURL: "",
+ Endpoints: nil,
+ }, nil)
+ return svc
+ },
+ versionService: func() *mocks.VersionsService {
+ svc := &mocks.VersionsService{}
+ svc.On("FindByID", context.Background(), models.ID(1), models.ID(1), mock.Anything).Return(&models.Version{
+ ID: models.ID(1),
+ ModelID: models.ID(1),
+ Model: &models.Model{
+ ID: models.ID(1),
+ Name: "model-1",
+ ProjectID: models.ID(1),
+ Project: mlp.Project{},
+ ExperimentID: 1,
+ Type: "pyfunc",
+ MlflowURL: "",
+ Endpoints: nil,
+ },
+ }, nil)
+ return svc
+ },
+ envService: func() *mocks.EnvironmentService {
+ svc := &mocks.EnvironmentService{}
+ svc.On("GetEnvironment", "dev").Return(&models.Environment{
+ ID: models.ID(1),
+ Name: "dev",
+ Cluster: "dev",
+ IsDefault: &trueBoolean,
+ Region: "id",
+ GcpProject: "dev-proj",
+ MaxCPU: "1",
+ MaxMemory: "1Gi",
+ }, nil)
+ return svc
+ },
+ endpointService: func() *mocks.EndpointsService {
+ svc := &mocks.EndpointsService{}
+ svc.On("FindByID", context.Background(), uuid).Return(&models.VersionEndpoint{
+ ID: uuid,
+ VersionID: models.ID(1),
+ VersionModelID: models.ID(1),
+ Status: models.EndpointPending,
+ ServiceName: "sample",
+ InferenceServiceName: "sample",
+ Namespace: "sample",
+ URL: "http://endpoint.svc",
+ MonitoringURL: "http://monitoring.com",
+ Environment: &models.Environment{
+ ID: models.ID(1),
+ Name: "dev",
+ Cluster: "dev",
+ IsDefault: &trueBoolean,
+ Region: "id",
+ GcpProject: "dev-proj",
+ MaxCPU: "1",
+ MaxMemory: "1Gi",
+ }, EnvironmentName: "dev",
+ Message: "",
+ ResourceRequest: nil,
+ EnvVars: models.EnvVars([]models.EnvVar{
+ {
+ Name: "WORKER",
+ Value: "1",
+ },
+ }),
+ }, nil)
+ return svc
+ },
+ expected: &Response{
+ code: http.StatusBadRequest,
+ data: Error{Message: "Error validating request: Updating endpoint status to running is not allowed when the endpoint is currently in the pending state"},
},
},
{
@@ -3949,7 +4062,7 @@ func TestUpdateEndpoint(t *testing.T) {
ID: uuid,
VersionID: models.ID(1),
VersionModelID: models.ID(1),
- Status: models.EndpointPending,
+ Status: models.EndpointRunning,
ServiceName: "sample",
InferenceServiceName: "sample",
Namespace: "sample",
@@ -4062,7 +4175,7 @@ func TestUpdateEndpoint(t *testing.T) {
ID: uuid,
VersionID: models.ID(1),
VersionModelID: models.ID(1),
- Status: models.EndpointPending,
+ Status: models.EndpointRunning,
ServiceName: "sample",
InferenceServiceName: "sample",
Namespace: "sample",
@@ -4239,7 +4352,7 @@ func TestUpdateEndpoint(t *testing.T) {
ID: uuid,
VersionID: models.ID(1),
VersionModelID: models.ID(1),
- Status: models.EndpointPending,
+ Status: models.EndpointRunning,
ServiceName: "sample",
InferenceServiceName: "sample",
Namespace: "sample",
@@ -4927,7 +5040,7 @@ func TestUpdateEndpoint(t *testing.T) {
},
expected: &Response{
code: http.StatusBadRequest,
- data: Error{Message: "Changing deployment type of a pending model is not allowed, please terminate it first."},
+ data: Error{Message: "Error validating request: Updating endpoint status to running is not allowed when the endpoint is currently in the pending state"},
},
},
}
diff --git a/api/client/model_environment.go b/api/client/model_environment.go
index 7bc65812b..1159a31f0 100644
--- a/api/client/model_environment.go
+++ b/api/client/model_environment.go
@@ -22,7 +22,7 @@ type Environment struct {
DefaultResourceRequest *ResourceRequest `json:"default_resource_request,omitempty"`
DefaultTransformerResourceRequest *ResourceRequest `json:"default_transformer_resource_request,omitempty"`
DefaultPredictionJobResourceRequest *PredictionJobResourceRequest `json:"default_prediction_job_resource_request,omitempty"`
- Gpus []Gpu `json:"gpus,omitempty"`
+ Gpus []GpuConfig `json:"gpus,omitempty"`
CreatedAt time.Time `json:"created_at,omitempty"`
UpdatedAt time.Time `json:"updated_at,omitempty"`
}
diff --git a/api/client/model_gpu.go b/api/client/model_gpu.go
deleted file mode 100644
index 4139b012b..000000000
--- a/api/client/model_gpu.go
+++ /dev/null
@@ -1,17 +0,0 @@
-/*
- * Merlin
- *
- * API Guide for accessing Merlin's model management, deployment, and serving functionalities
- *
- * API version: 0.14.0
- * Generated by: Swagger Codegen (https://github.com/swagger-api/swagger-codegen.git)
- */
-package client
-
-type Gpu struct {
- Values []string `json:"values,omitempty"`
- DisplayName string `json:"display_name,omitempty"`
- ResourceType string `json:"resource_type,omitempty"`
- NodeSelector map[string]string `json:"node_selector,omitempty"`
- MonthlyCostPerGpu float64 `json:"monthly_cost_per_gpu,omitempty"`
-}
diff --git a/api/client/model_gpu_config.go b/api/client/model_gpu_config.go
new file mode 100644
index 000000000..0e74d1dfa
--- /dev/null
+++ b/api/client/model_gpu_config.go
@@ -0,0 +1,19 @@
+/*
+ * Merlin
+ *
+ * API Guide for accessing Merlin's model management, deployment, and serving functionalities
+ *
+ * API version: 0.14.0
+ * Generated by: Swagger Codegen (https://github.com/swagger-api/swagger-codegen.git)
+ */
+package client
+
+type GpuConfig struct {
+ Name string `json:"name,omitempty"`
+ Values []string `json:"values,omitempty"`
+ ResourceType string `json:"resource_type,omitempty"`
+ NodeSelector map[string]string `json:"node_selector,omitempty"`
+ Tolerations []GpuToleration `json:"tolerations,omitempty"`
+ MinMonthlyCostPerGpu float64 `json:"min_monthly_cost_per_gpu,omitempty"`
+ MaxMonthlyCostPerGpu float64 `json:"max_monthly_cost_per_gpu,omitempty"`
+}
diff --git a/api/client/model_gpu_toleration.go b/api/client/model_gpu_toleration.go
new file mode 100644
index 000000000..c0c7a9c29
--- /dev/null
+++ b/api/client/model_gpu_toleration.go
@@ -0,0 +1,17 @@
+/*
+ * Merlin
+ *
+ * API Guide for accessing Merlin's model management, deployment, and serving functionalities
+ *
+ * API version: 0.14.0
+ * Generated by: Swagger Codegen (https://github.com/swagger-api/swagger-codegen.git)
+ */
+package client
+
+type GpuToleration struct {
+ Key string `json:"key,omitempty"`
+ Operator string `json:"operator,omitempty"`
+ Value string `json:"value,omitempty"`
+ Effect string `json:"effect,omitempty"`
+ TolerationSeconds int64 `json:"toleration_seconds,omitempty"`
+}
diff --git a/api/client/model_resource_request.go b/api/client/model_resource_request.go
index 985fc5f23..5c9383b9a 100644
--- a/api/client/model_resource_request.go
+++ b/api/client/model_resource_request.go
@@ -9,11 +9,10 @@
package client
type ResourceRequest struct {
- MinReplica int32 `json:"min_replica,omitempty"`
- MaxReplica int32 `json:"max_replica,omitempty"`
- CpuRequest string `json:"cpu_request,omitempty"`
- MemoryRequest string `json:"memory_request,omitempty"`
- GpuResourceType string `json:"gpu_resource_type,omitempty"`
- GpuRequest string `json:"gpu_request,omitempty"`
- GpuNodeSelector map[string]string `json:"gpu_node_selector,omitempty"`
+ MinReplica int32 `json:"min_replica,omitempty"`
+ MaxReplica int32 `json:"max_replica,omitempty"`
+ CpuRequest string `json:"cpu_request,omitempty"`
+ MemoryRequest string `json:"memory_request,omitempty"`
+ GpuName string `json:"gpu_name,omitempty"`
+ GpuRequest string `json:"gpu_request,omitempty"`
}
diff --git a/api/cluster/resource/templater.go b/api/cluster/resource/templater.go
index 622419757..41f995240 100644
--- a/api/cluster/resource/templater.go
+++ b/api/cluster/resource/templater.go
@@ -256,16 +256,23 @@ func createPredictorSpec(modelService *models.Service, config *config.Deployment
}
nodeSelector := map[string]string{}
- if !modelService.ResourceRequest.GPURequest.IsZero() {
- // Declare and initialize resourceType and resourceQuantity variables
- resourceType := corev1.ResourceName(modelService.ResourceRequest.GPUResourceType)
- resourceQuantity := modelService.ResourceRequest.GPURequest
-
- // Set the resourceType as the key in the maps, with resourceQuantity as the value
- resources.Requests[resourceType] = resourceQuantity
- resources.Limits[resourceType] = resourceQuantity
-
- nodeSelector = modelService.ResourceRequest.GPUNodeSelector
+ tolerations := []corev1.Toleration{}
+ if modelService.ResourceRequest.GPUName != "" && !modelService.ResourceRequest.GPURequest.IsZero() {
+ // Look up the GPU resource type and quantity from the DeploymentConfig
+ for _, gpuConfig := range config.GPUs {
+ if gpuConfig.Name == modelService.ResourceRequest.GPUName {
+ // Declare and initialize resourceType and resourceQuantity variables
+ resourceType := corev1.ResourceName(gpuConfig.ResourceType)
+ resourceQuantity := modelService.ResourceRequest.GPURequest
+
+ // Set the resourceType as the key in the maps, with resourceQuantity as the value
+ resources.Requests[resourceType] = resourceQuantity
+ resources.Limits[resourceType] = resourceQuantity
+
+ nodeSelector = gpuConfig.NodeSelector
+ tolerations = gpuConfig.Tolerations
+ }
+ }
}
// liveness probe config. if env var to disable != true or not set, it will default to enabled
@@ -360,13 +367,17 @@ func createPredictorSpec(modelService *models.Service, config *config.Deployment
},
}
case models.ModelTypeCustom:
- predictorSpec = createCustomPredictorSpec(modelService, resources, nodeSelector)
+ predictorSpec = createCustomPredictorSpec(modelService, resources, nodeSelector, tolerations)
}
if len(nodeSelector) > 0 {
predictorSpec.NodeSelector = nodeSelector
}
+ if len(tolerations) > 0 {
+ predictorSpec.Tolerations = tolerations
+ }
+
var loggerSpec *kservev1beta1.LoggerSpec
if modelService.Logger != nil && modelService.Logger.Model != nil && modelService.Logger.Model.Enabled {
logger := modelService.Logger
@@ -802,7 +813,7 @@ func createDefaultPredictorEnvVars(modelService *models.Service) models.EnvVars
return defaultEnvVars
}
-func createCustomPredictorSpec(modelService *models.Service, resources corev1.ResourceRequirements, nodeSelector map[string]string) kservev1beta1.PredictorSpec {
+func createCustomPredictorSpec(modelService *models.Service, resources corev1.ResourceRequirements, nodeSelector map[string]string, tolerations []corev1.Toleration) kservev1beta1.PredictorSpec {
envVars := modelService.EnvVars
// Add default env var (Overwrite by user not allowed)
@@ -846,6 +857,10 @@ func createCustomPredictorSpec(modelService *models.Service, resources corev1.Re
spec.NodeSelector = nodeSelector
}
+ if len(tolerations) > 0 {
+ spec.Tolerations = tolerations
+ }
+
return spec
}
diff --git a/api/cluster/resource/templater_gpu_test.go b/api/cluster/resource/templater_gpu_test.go
index cf840de06..ed6c05b2f 100644
--- a/api/cluster/resource/templater_gpu_test.go
+++ b/api/cluster/resource/templater_gpu_test.go
@@ -22,6 +22,35 @@ import (
)
var (
+ defaultGPUNodeSelector = map[string]string{"cloud.google.com/gke-accelerator": "nvidia-tesla-p4"}
+
+ defaultGPUTolerations = []corev1.Toleration{
+ {
+ Key: "caraml/nvidia-tesla-p4",
+ Operator: corev1.TolerationOpEqual,
+ Value: "enabled",
+ Effect: corev1.TaintEffectNoSchedule,
+ },
+ {
+ Key: "nvidia.com/gpu",
+ Operator: corev1.TolerationOpEqual,
+ Value: "present",
+ Effect: corev1.TaintEffectNoSchedule,
+ },
+ }
+
+ defaultGPUsConfig = []config.GPUConfig{
+ {
+ Name: "NVIDIA P4",
+ Values: []string{"None", "1", "2", "4"},
+ ResourceType: "nvidia.com/gpu",
+ NodeSelector: defaultGPUNodeSelector,
+ Tolerations: defaultGPUTolerations,
+ MinMonthlyCostPerGPU: 332.15,
+ MaxMonthlyCostPerGPU: 332.15,
+ },
+ }
+
expDefaultModelResourceRequestsWithGPU = corev1.ResourceRequirements{
Requests: corev1.ResourceList{
corev1.ResourceCPU: defaultModelResourceRequests.CPURequest,
@@ -67,13 +96,12 @@ func TestCreateInferenceServiceSpecWithGPU(t *testing.T) {
},
Protocol: protocol.HttpJson,
ResourceRequest: &models.ResourceRequest{
- MinReplica: 1,
- MaxReplica: 2,
- CPURequest: resource.MustParse("500m"),
- MemoryRequest: resource.MustParse("500Mi"),
- GPURequest: resource.MustParse("1"),
- GPUResourceType: "nvidia.com/gpu",
- GPUNodeSelector: map[string]string{"cloud.google.com/gke-accelerator": "nvidia-tesla-p4"},
+ MinReplica: 1,
+ MaxReplica: 2,
+ CPURequest: resource.MustParse("500m"),
+ MemoryRequest: resource.MustParse("500Mi"),
+ GPUName: "NVIDIA P4",
+ GPURequest: resource.MustParse("1"),
},
}
@@ -142,7 +170,8 @@ func TestCreateInferenceServiceSpecWithGPU(t *testing.T) {
MaxReplicas: defaultModelResourceRequests.MaxReplica,
},
PodSpec: kservev1beta1.PodSpec{
- NodeSelector: modelSvc.ResourceRequest.GPUNodeSelector,
+ NodeSelector: defaultGPUNodeSelector,
+ Tolerations: defaultGPUTolerations,
},
},
},
@@ -214,7 +243,8 @@ func TestCreateInferenceServiceSpecWithGPU(t *testing.T) {
MaxReplicas: defaultModelResourceRequests.MaxReplica,
},
PodSpec: kservev1beta1.PodSpec{
- NodeSelector: modelSvc.ResourceRequest.GPUNodeSelector,
+ NodeSelector: defaultGPUNodeSelector,
+ Tolerations: defaultGPUTolerations,
},
},
},
@@ -272,7 +302,8 @@ func TestCreateInferenceServiceSpecWithGPU(t *testing.T) {
MaxReplicas: defaultModelResourceRequests.MaxReplica,
},
PodSpec: kservev1beta1.PodSpec{
- NodeSelector: modelSvc.ResourceRequest.GPUNodeSelector,
+ NodeSelector: defaultGPUNodeSelector,
+ Tolerations: defaultGPUTolerations,
},
},
},
@@ -330,7 +361,8 @@ func TestCreateInferenceServiceSpecWithGPU(t *testing.T) {
MaxReplicas: defaultModelResourceRequests.MaxReplica,
},
PodSpec: kservev1beta1.PodSpec{
- NodeSelector: modelSvc.ResourceRequest.GPUNodeSelector,
+ NodeSelector: defaultGPUNodeSelector,
+ Tolerations: defaultGPUTolerations,
},
},
},
@@ -385,7 +417,8 @@ func TestCreateInferenceServiceSpecWithGPU(t *testing.T) {
MaxReplicas: defaultModelResourceRequests.MaxReplica,
},
PodSpec: kservev1beta1.PodSpec{
- NodeSelector: modelSvc.ResourceRequest.GPUNodeSelector,
+ NodeSelector: defaultGPUNodeSelector,
+ Tolerations: defaultGPUTolerations,
},
},
},
@@ -442,7 +475,8 @@ func TestCreateInferenceServiceSpecWithGPU(t *testing.T) {
MaxReplicas: defaultModelResourceRequests.MaxReplica,
},
PodSpec: kservev1beta1.PodSpec{
- NodeSelector: modelSvc.ResourceRequest.GPUNodeSelector,
+ NodeSelector: defaultGPUNodeSelector,
+ Tolerations: defaultGPUTolerations,
},
},
},
@@ -499,7 +533,8 @@ func TestCreateInferenceServiceSpecWithGPU(t *testing.T) {
MaxReplicas: defaultModelResourceRequests.MaxReplica,
},
PodSpec: kservev1beta1.PodSpec{
- NodeSelector: modelSvc.ResourceRequest.GPUNodeSelector,
+ NodeSelector: defaultGPUNodeSelector,
+ Tolerations: defaultGPUTolerations,
},
},
},
@@ -556,7 +591,8 @@ func TestCreateInferenceServiceSpecWithGPU(t *testing.T) {
MaxReplicas: defaultModelResourceRequests.MaxReplica,
},
PodSpec: kservev1beta1.PodSpec{
- NodeSelector: modelSvc.ResourceRequest.GPUNodeSelector,
+ NodeSelector: defaultGPUNodeSelector,
+ Tolerations: defaultGPUTolerations,
},
},
},
@@ -612,7 +648,8 @@ func TestCreateInferenceServiceSpecWithGPU(t *testing.T) {
Resources: expDefaultModelResourceRequestsWithGPU,
},
},
- NodeSelector: modelSvc.ResourceRequest.GPUNodeSelector,
+ NodeSelector: defaultGPUNodeSelector,
+ Tolerations: defaultGPUTolerations,
},
ComponentExtensionSpec: kservev1beta1.ComponentExtensionSpec{
MinReplicas: &defaultModelResourceRequests.MinReplica,
@@ -672,7 +709,8 @@ func TestCreateInferenceServiceSpecWithGPU(t *testing.T) {
Resources: expDefaultModelResourceRequestsWithGPU,
},
},
- NodeSelector: modelSvc.ResourceRequest.GPUNodeSelector,
+ NodeSelector: defaultGPUNodeSelector,
+ Tolerations: defaultGPUTolerations,
},
ComponentExtensionSpec: kservev1beta1.ComponentExtensionSpec{
MinReplicas: &defaultModelResourceRequests.MinReplica,
@@ -738,7 +776,8 @@ func TestCreateInferenceServiceSpecWithGPU(t *testing.T) {
Resources: expDefaultModelResourceRequestsWithGPU,
},
},
- NodeSelector: modelSvc.ResourceRequest.GPUNodeSelector,
+ NodeSelector: defaultGPUNodeSelector,
+ Tolerations: defaultGPUTolerations,
},
ComponentExtensionSpec: kservev1beta1.ComponentExtensionSpec{
MinReplicas: &defaultModelResourceRequests.MinReplica,
@@ -805,7 +844,8 @@ func TestCreateInferenceServiceSpecWithGPU(t *testing.T) {
},
},
},
- NodeSelector: modelSvc.ResourceRequest.GPUNodeSelector,
+ NodeSelector: defaultGPUNodeSelector,
+ Tolerations: defaultGPUTolerations,
},
ComponentExtensionSpec: kservev1beta1.ComponentExtensionSpec{
MinReplicas: &modelSvc.ResourceRequest.MinReplica,
@@ -874,7 +914,8 @@ func TestCreateInferenceServiceSpecWithGPU(t *testing.T) {
MaxReplicas: defaultModelResourceRequests.MaxReplica,
},
PodSpec: kservev1beta1.PodSpec{
- NodeSelector: modelSvc.ResourceRequest.GPUNodeSelector,
+ NodeSelector: defaultGPUNodeSelector,
+ Tolerations: defaultGPUTolerations,
},
},
},
@@ -961,7 +1002,8 @@ func TestCreateInferenceServiceSpecWithGPU(t *testing.T) {
MaxReplicas: defaultModelResourceRequests.MaxReplica,
},
PodSpec: kservev1beta1.PodSpec{
- NodeSelector: modelSvc.ResourceRequest.GPUNodeSelector,
+ NodeSelector: defaultGPUNodeSelector,
+ Tolerations: defaultGPUTolerations,
},
},
},
@@ -1026,7 +1068,8 @@ func TestCreateInferenceServiceSpecWithGPU(t *testing.T) {
MaxReplicas: defaultModelResourceRequests.MaxReplica,
},
PodSpec: kservev1beta1.PodSpec{
- NodeSelector: modelSvc.ResourceRequest.GPUNodeSelector,
+ NodeSelector: defaultGPUNodeSelector,
+ Tolerations: defaultGPUTolerations,
},
},
},
@@ -1091,7 +1134,8 @@ func TestCreateInferenceServiceSpecWithGPU(t *testing.T) {
MaxReplicas: modelSvc.ResourceRequest.MaxReplica,
},
PodSpec: kservev1beta1.PodSpec{
- NodeSelector: modelSvc.ResourceRequest.GPUNodeSelector,
+ NodeSelector: defaultGPUNodeSelector,
+ Tolerations: defaultGPUTolerations,
},
},
},
@@ -1156,7 +1200,8 @@ func TestCreateInferenceServiceSpecWithGPU(t *testing.T) {
MaxReplicas: defaultModelResourceRequests.MaxReplica,
},
PodSpec: kservev1beta1.PodSpec{
- NodeSelector: modelSvc.ResourceRequest.GPUNodeSelector,
+ NodeSelector: defaultGPUNodeSelector,
+ Tolerations: defaultGPUTolerations,
},
},
},
@@ -1221,7 +1266,8 @@ func TestCreateInferenceServiceSpecWithGPU(t *testing.T) {
MaxReplicas: defaultModelResourceRequests.MaxReplica,
},
PodSpec: kservev1beta1.PodSpec{
- NodeSelector: modelSvc.ResourceRequest.GPUNodeSelector,
+ NodeSelector: defaultGPUNodeSelector,
+ Tolerations: defaultGPUTolerations,
},
},
},
@@ -1279,7 +1325,8 @@ func TestCreateInferenceServiceSpecWithGPU(t *testing.T) {
MaxReplicas: defaultModelResourceRequests.MaxReplica,
},
PodSpec: kservev1beta1.PodSpec{
- NodeSelector: modelSvc.ResourceRequest.GPUNodeSelector,
+ NodeSelector: defaultGPUNodeSelector,
+ Tolerations: defaultGPUTolerations,
},
},
},
@@ -1336,7 +1383,8 @@ func TestCreateInferenceServiceSpecWithGPU(t *testing.T) {
LivenessProbe: probeConfigUPI,
},
},
- NodeSelector: modelSvc.ResourceRequest.GPUNodeSelector,
+ NodeSelector: defaultGPUNodeSelector,
+ Tolerations: defaultGPUTolerations,
},
ComponentExtensionSpec: kservev1beta1.ComponentExtensionSpec{
MinReplicas: &defaultModelResourceRequests.MinReplica,
@@ -1398,7 +1446,8 @@ func TestCreateInferenceServiceSpecWithGPU(t *testing.T) {
MaxReplicas: defaultModelResourceRequests.MaxReplica,
},
PodSpec: kservev1beta1.PodSpec{
- NodeSelector: modelSvc.ResourceRequest.GPUNodeSelector,
+ NodeSelector: defaultGPUNodeSelector,
+ Tolerations: defaultGPUTolerations,
},
},
},
@@ -1460,7 +1509,8 @@ func TestCreateInferenceServiceSpecWithGPU(t *testing.T) {
Ports: grpcContainerPorts,
},
},
- NodeSelector: modelSvc.ResourceRequest.GPUNodeSelector,
+ NodeSelector: defaultGPUNodeSelector,
+ Tolerations: defaultGPUTolerations,
},
ComponentExtensionSpec: kservev1beta1.ComponentExtensionSpec{
MinReplicas: &defaultModelResourceRequests.MinReplica,
@@ -1479,6 +1529,7 @@ func TestCreateInferenceServiceSpecWithGPU(t *testing.T) {
DefaultTransformerResourceRequests: defaultTransformerResourceRequests,
QueueResourcePercentage: tt.resourcePercentage,
PyfuncGRPCOptions: "{}",
+ GPUs: defaultGPUsConfig,
}
tpl := NewInferenceServiceTemplater(standardTransformerConfig)
diff --git a/api/config/environment.go b/api/config/environment.go
index d7658cd05..b19cc3e30 100644
--- a/api/config/environment.go
+++ b/api/config/environment.go
@@ -113,19 +113,26 @@ type ResourceRequestConfig struct {
}
type GPUConfig struct {
+ // Name is used as the key to identify the GPU configuration.
+ // It also specifies how the accelerator type will be written in the UI.
+ // Example: "NVIDIA T4"
+ Name string `yaml:"name"`
// Values limits how many GPUs can be requested by users.
- // Example: "none", "1", "2", "4"
+ // Example: "None", "1", "2", "4"
Values []string `yaml:"values"`
- // Specifies how the accelerator type will be written in the UI.
- // Example: "NVIDIA T4"
- DisplayName string `yaml:"display_name"`
// Specifies how the accelerator type will be translated to
// K8s resource type. Example: nvidia.com/gpu
ResourceType string `yaml:"resource_type"`
// To deploy the models on a specific GPU node.
NodeSelector map[string]string `yaml:"node_selector"`
+ // To deploy the models on a specific GPU node via taints and tolerations.
+ Tolerations []corev1.Toleration `yaml:"tolerations"`
+ // MinMonthlyCostPerGPU is the minimum monthly cost per GPU. For example, if you enable time-sharing GPUs with 8 max shared clients,
+ // the minimum monthly cost per GPU is max_monthly_cost_per_gpu divided by 8 (e.g. 189.07 / 8 is about 23.63).
+ // MaxMonthlyCostPerGPU is the maximum monthly cost if you use the whole GPU.
// https://cloud.google.com/compute/gpus-pricing#other-gpu-models
- MonthlyCostPerGPU float64 `yaml:"monthly_cost_per_gpu"`
+ MinMonthlyCostPerGPU float64 `yaml:"min_monthly_cost_per_gpu"`
+ MaxMonthlyCostPerGPU float64 `yaml:"max_monthly_cost_per_gpu"`
}
func InitEnvironmentConfigs(path string) ([]*EnvironmentConfig, error) {
diff --git a/api/config/environment_test.go b/api/config/environment_test.go
index fec59719f..24db43b33 100644
--- a/api/config/environment_test.go
+++ b/api/config/environment_test.go
@@ -186,13 +186,50 @@ func TestGPUsConfig(t *testing.T) {
envConfigPath: "./testdata/valid-environment-1.yaml",
expectedGPUsConfig: []GPUConfig{
{
- Values: []string{"none", "1"},
- DisplayName: "NVIDIA T4",
+ Name: "NVIDIA T4",
+ Values: []string{"None", "1"},
ResourceType: "nvidia.com/gpu",
NodeSelector: map[string]string{
"cloud.google.com/gke-accelerator": "nvidia-tesla-t4",
},
- MonthlyCostPerGPU: 189.07,
+ MinMonthlyCostPerGPU: 189.07,
+ MaxMonthlyCostPerGPU: 189.07,
+ },
+ {
+ Name: "NVIDIA T4 with Time Sharing",
+ Values: []string{"None", "1"},
+ ResourceType: "nvidia.com/gpu",
+ NodeSelector: map[string]string{
+ "cloud.google.com/gke-accelerator": "nvidia-tesla-t4",
+ "cloud.google.com/gke-max-shared-clients-per-gpu": "8",
+ "cloud.google.com/gke-gpu-sharing-strategy": "time-sharing",
+ },
+ MinMonthlyCostPerGPU: 23.63,
+ MaxMonthlyCostPerGPU: 189.07,
+ },
+ {
+ Name: "NVIDIA P4",
+ Values: []string{"None", "1", "2"},
+ ResourceType: "nvidia.com/gpu",
+ NodeSelector: map[string]string{
+ "cloud.google.com/gke-accelerator": "nvidia-tesla-p4",
+ },
+ Tolerations: []corev1.Toleration{
+ {
+ Key: "caraml/nvidia-tesla-p4",
+ Operator: corev1.TolerationOpEqual,
+ Value: "enabled",
+ Effect: corev1.TaintEffectNoSchedule,
+ },
+ {
+ Key: "nvidia.com/gpu",
+ Operator: corev1.TolerationOpEqual,
+ Value: "present",
+ Effect: corev1.TaintEffectNoSchedule,
+ },
+ },
+ MinMonthlyCostPerGPU: 332.15,
+ MaxMonthlyCostPerGPU: 332.15,
},
},
},
diff --git a/api/config/testdata/valid-environment-1.yaml b/api/config/testdata/valid-environment-1.yaml
index 0fbf8d83c..1ea25a60c 100644
--- a/api/config/testdata/valid-environment-1.yaml
+++ b/api/config/testdata/valid-environment-1.yaml
@@ -41,9 +41,35 @@
interactiveMode: IfAvailable
provideClusterInfo: true
gpus:
- - values: ["none", "1"]
- display_name: "NVIDIA T4"
+ - name: "NVIDIA T4"
+ values: ["None", "1"]
resource_type: "nvidia.com/gpu"
node_selector:
"cloud.google.com/gke-accelerator": "nvidia-tesla-t4"
- monthly_cost_per_gpu: 189.07
+ min_monthly_cost_per_gpu: 189.07
+ max_monthly_cost_per_gpu: 189.07
+ - name: "NVIDIA T4 with Time Sharing"
+ values: ["None", "1"]
+ resource_type: "nvidia.com/gpu"
+ node_selector:
+ "cloud.google.com/gke-accelerator": "nvidia-tesla-t4"
+ "cloud.google.com/gke-max-shared-clients-per-gpu": "8"
+ "cloud.google.com/gke-gpu-sharing-strategy": "time-sharing"
+ min_monthly_cost_per_gpu: 23.63
+ max_monthly_cost_per_gpu: 189.07
+ - name: "NVIDIA P4"
+ values: ["None", "1", "2"]
+ resource_type: "nvidia.com/gpu"
+ node_selector:
+ "cloud.google.com/gke-accelerator": "nvidia-tesla-p4"
+ tolerations:
+ - key: "caraml/nvidia-tesla-p4"
+ operator: "Equal"
+ value: "enabled"
+ effect: "NoSchedule"
+ - key: "nvidia.com/gpu"
+ operator: "Equal"
+ value: "present"
+ effect: "NoSchedule"
+ min_monthly_cost_per_gpu: 332.15
+ max_monthly_cost_per_gpu: 332.15
diff --git a/api/models/gpu.go b/api/models/gpu.go
index ffa15c49e..26b9f3915 100644
--- a/api/models/gpu.go
+++ b/api/models/gpu.go
@@ -5,23 +5,32 @@ import (
"encoding/json"
"errors"
+ corev1 "k8s.io/api/core/v1"
+
"github.com/caraml-dev/merlin/config"
)
type GPU struct {
+ // Name is used as the key to identify the GPU configuration.
+ // It also specifies how the accelerator type will be written in the UI.
+ // Example: "NVIDIA T4"
+ Name string `json:"name"`
// Values limits how many GPUs can be requested by users.
// Example: "none", "1", "2", "4"
Values []string `json:"values"`
- // Specifies how the accelerator type will be written in the UI.
- // Example: "NVIDIA T4"
- DisplayName string `json:"display_name"`
// Specifies how the accelerator type will be translated to
// K8s resource type. Example: nvidia.com/gpu
ResourceType string `json:"resource_type"`
// To deploy the models on a specific GPU node.
- NodeSelector map[string]string `json:"node_selector"`
+ NodeSelector map[string]string `json:"node_selector,omitempty"`
+ // To deploy the models on a specific GPU node via taints and tolerations.
+ Tolerations []corev1.Toleration `json:"tolerations,omitempty"`
+ // MinMonthlyCostPerGPU is the minimum monthly cost per GPU. For example, if you enable time-sharing GPUs with 8 max shared clients,
+ // the minimum monthly cost per GPU is max_monthly_cost_per_gpu divided by 8 (e.g. 189.07 / 8 is about 23.63).
+ // MaxMonthlyCostPerGPU is the maximum monthly cost if you use the whole GPU.
// https://cloud.google.com/compute/gpus-pricing#other-gpu-models
- MonthlyCostPerGPU float64 `json:"monthly_cost_per_gpu"`
+ MinMonthlyCostPerGPU float64 `json:"min_monthly_cost_per_gpu"`
+ MaxMonthlyCostPerGPU float64 `json:"max_monthly_cost_per_gpu"`
}
type GPUs []GPU
@@ -45,11 +54,13 @@ func ParseGPUsConfig(configGPUs []config.GPUConfig) GPUs {
for _, configGPU := range configGPUs {
gpu := GPU{
- Values: configGPU.Values,
- DisplayName: configGPU.DisplayName,
- ResourceType: configGPU.ResourceType,
- NodeSelector: configGPU.NodeSelector,
- MonthlyCostPerGPU: configGPU.MonthlyCostPerGPU,
+ Name: configGPU.Name,
+ Values: configGPU.Values,
+ ResourceType: configGPU.ResourceType,
+ NodeSelector: configGPU.NodeSelector,
+ Tolerations: configGPU.Tolerations,
+ MinMonthlyCostPerGPU: configGPU.MinMonthlyCostPerGPU,
+ MaxMonthlyCostPerGPU: configGPU.MaxMonthlyCostPerGPU,
}
gpus = append(gpus, gpu)
}
diff --git a/api/models/gpu_test.go b/api/models/gpu_test.go
new file mode 100644
index 000000000..179f59b9b
--- /dev/null
+++ b/api/models/gpu_test.go
@@ -0,0 +1,86 @@
+package models
+
+import (
+ "reflect"
+ "testing"
+
+ corev1 "k8s.io/api/core/v1"
+
+ "github.com/caraml-dev/merlin/config"
+)
+
+func TestParseGPUsConfig(t *testing.T) {
+ type args struct {
+ configGPUs []config.GPUConfig
+ }
+ tests := []struct {
+ name string
+ args args
+ want GPUs
+ }{
+ {
+ name: "successful parsing",
+ args: args{
+ configGPUs: []config.GPUConfig{
+ {
+ Name: "NVIDIA P4",
+ Values: []string{"None", "1", "2"},
+ ResourceType: "nvidia.com/gpu",
+ NodeSelector: map[string]string{
+ "cloud.google.com/gke-accelerator": "nvidia-tesla-p4",
+ },
+ Tolerations: []corev1.Toleration{
+ {
+ Key: "caraml/nvidia-tesla-p4",
+ Operator: corev1.TolerationOpEqual,
+ Value: "enabled",
+ Effect: "NoSchedule",
+ },
+ {
+ Key: "nvidia.com/gpu",
+ Operator: corev1.TolerationOpEqual,
+ Value: "present",
+ Effect: "NoSchedule",
+ },
+ },
+ MinMonthlyCostPerGPU: 332.15,
+ MaxMonthlyCostPerGPU: 332.15,
+ },
+ },
+ },
+ want: GPUs{
+ {
+ Name: "NVIDIA P4",
+ Values: []string{"None", "1", "2"},
+ ResourceType: "nvidia.com/gpu",
+ NodeSelector: map[string]string{
+ "cloud.google.com/gke-accelerator": "nvidia-tesla-p4",
+ },
+ Tolerations: []corev1.Toleration{
+ {
+ Key: "caraml/nvidia-tesla-p4",
+ Operator: corev1.TolerationOpEqual,
+ Value: "enabled",
+ Effect: "NoSchedule",
+ },
+ {
+ Key: "nvidia.com/gpu",
+ Operator: corev1.TolerationOpEqual,
+ Value: "present",
+ Effect: "NoSchedule",
+ },
+ },
+ MinMonthlyCostPerGPU: 332.15,
+ MaxMonthlyCostPerGPU: 332.15,
+ },
+ },
+ },
+ }
+ for _, tt := range tests {
+ t.Run(tt.name, func(t *testing.T) {
+ if got := ParseGPUsConfig(tt.args.configGPUs); !reflect.DeepEqual(got, tt.want) {
+ t.Errorf("ParseGPUsConfig() = %v, want %v", got, tt.want)
+ }
+ })
+ }
+}
diff --git a/api/models/resource_request.go b/api/models/resource_request.go
index b537f1914..222c4c2da 100644
--- a/api/models/resource_request.go
+++ b/api/models/resource_request.go
@@ -31,12 +31,11 @@ type ResourceRequest struct {
CPURequest resource.Quantity `json:"cpu_request"`
// Memory request of inference service
MemoryRequest resource.Quantity `json:"memory_request"`
- // GPU resource type (nvidia.com/gpu or amd.com/gpu)
- GPUResourceType string `json:"gpu_resource_type"`
+
+ // GPU name
+ GPUName string `json:"gpu_name,omitempty"`
// GPU Quantity requests
- GPURequest resource.Quantity `json:"gpu_request"`
- // GPU Node selector
- GPUNodeSelector map[string]string `json:"gpu_node_selector"`
+ GPURequest resource.Quantity `json:"gpu_request,omitempty"`
}
func (r ResourceRequest) Value() (driver.Value, error) {
diff --git a/api/service/environment_service_test.go b/api/service/environment_service_test.go
index 6a2cb2ae5..46db6da95 100644
--- a/api/service/environment_service_test.go
+++ b/api/service/environment_service_test.go
@@ -64,13 +64,14 @@ func TestSave(t *testing.T) {
},
GPUs: models.GPUs{
{
+ Name: "NVIDIA T4",
Values: []string{"none", "1"},
- DisplayName: "NVIDIA T4",
ResourceType: "nvidia.com/gpu",
NodeSelector: map[string]string{
"cloud.google.com/gke-accelerator": "nvidia-tesla-t4",
},
- MonthlyCostPerGPU: 189.07,
+ MinMonthlyCostPerGPU: 189.07,
+ MaxMonthlyCostPerGPU: 189.07,
},
},
},
diff --git a/docs/connecting-to-merlin/python-sdk.md b/docs/connecting-to-merlin/python-sdk.md
index 150944f8c..d0be7bb7f 100644
--- a/docs/connecting-to-merlin/python-sdk.md
+++ b/docs/connecting-to-merlin/python-sdk.md
@@ -15,6 +15,10 @@ from merlin.model import ModelType
# Connect to an existing Merlin deployment
merlin.set_url("merlin.example.com")
+# Set the active model to the given name. If no model with that name exists, a new
+# model will be created.
+merlin.set_model("example-model", ModelType.PYFUNC)
+
# Ensure that you're connected by printing out some Model Endpoints
merlin.list_model_endpoints()
```
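
For illustration, a minimal sketch of how a GPU could be requested through the SDK once this change lands. It assumes the SDK-level `ResourceRequest` (imported here from `merlin.resource_request`) exposes the `gpu_name` and `gpu_request` attributes that `deploy()` reads in this diff, and that the target environment defines an "NVIDIA T4" GPU as in the example environment config; the project name and model directory are placeholders.

```
import merlin
from merlin.model import ModelType
from merlin.resource_request import ResourceRequest

merlin.set_url("merlin.example.com")
merlin.set_project("sample-project")                 # placeholder project
merlin.set_model("example-model", ModelType.SKLEARN)

# Request one "NVIDIA T4" by name; the server resolves the name to the
# configured resource_type, node_selector, and tolerations.
resource_request = ResourceRequest(min_replica=1, max_replica=2,
                                   cpu_request="500m", memory_request="512Mi")
resource_request.gpu_name = "NVIDIA T4"   # assumed attribute; must match a configured GPU name
resource_request.gpu_request = "1"        # must be one of that GPU's allowed "values"

with merlin.new_model_version() as v:
    merlin.log_model(model_dir="model")   # placeholder artifact directory

endpoint = v.deploy(resource_request=resource_request)
print(endpoint.url)
```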
diff --git a/python/sdk/client/__init__.py b/python/sdk/client/__init__.py
index ebba7064b..9aa7d5af9 100644
--- a/python/sdk/client/__init__.py
+++ b/python/sdk/client/__init__.py
@@ -42,7 +42,8 @@
from client.models.environment import Environment
from client.models.file_format import FileFormat
from client.models.free_form_object import FreeFormObject
-from client.models.gpu import GPU
+from client.models.gpu_config import GPUConfig
+from client.models.gpu_toleration import GPUToleration
from client.models.label import Label
from client.models.logger import Logger
from client.models.logger_config import LoggerConfig
diff --git a/python/sdk/client/models/__init__.py b/python/sdk/client/models/__init__.py
index 597e406ab..d4dcc14f2 100644
--- a/python/sdk/client/models/__init__.py
+++ b/python/sdk/client/models/__init__.py
@@ -26,7 +26,8 @@
from client.models.environment import Environment
from client.models.file_format import FileFormat
from client.models.free_form_object import FreeFormObject
-from client.models.gpu import GPU
+from client.models.gpu_config import GPUConfig
+from client.models.gpu_toleration import GPUToleration
from client.models.label import Label
from client.models.logger import Logger
from client.models.logger_config import LoggerConfig
diff --git a/python/sdk/client/models/environment.py b/python/sdk/client/models/environment.py
index 18d1d07c0..74fe16193 100644
--- a/python/sdk/client/models/environment.py
+++ b/python/sdk/client/models/environment.py
@@ -37,7 +37,7 @@ class Environment(object):
'default_resource_request': 'ResourceRequest',
'default_transformer_resource_request': 'ResourceRequest',
'default_prediction_job_resource_request': 'PredictionJobResourceRequest',
- 'gpus': 'list[GPU]',
+ 'gpus': 'list[GPUConfig]',
'created_at': 'datetime',
'updated_at': 'datetime'
}
@@ -293,7 +293,7 @@ def gpus(self):
:return: The gpus of this Environment. # noqa: E501
- :rtype: list[GPU]
+ :rtype: list[GPUConfig]
"""
return self._gpus
@@ -303,7 +303,7 @@ def gpus(self, gpus):
:param gpus: The gpus of this Environment. # noqa: E501
- :type: list[GPU]
+ :type: list[GPUConfig]
"""
self._gpus = gpus
diff --git a/python/sdk/client/models/gpu.py b/python/sdk/client/models/gpu.py
deleted file mode 100644
index 632320a9c..000000000
--- a/python/sdk/client/models/gpu.py
+++ /dev/null
@@ -1,214 +0,0 @@
-# coding: utf-8
-
-"""
- Merlin
-
- API Guide for accessing Merlin's model management, deployment, and serving functionalities # noqa: E501
-
- OpenAPI spec version: 0.14.0
-
- Generated by: https://github.com/swagger-api/swagger-codegen.git
-"""
-
-import pprint
-import re # noqa: F401
-
-import six
-
-class GPU(object):
- """NOTE: This class is auto generated by the swagger code generator program.
-
- Do not edit the class manually.
- """
- """
- Attributes:
- swagger_types (dict): The key is attribute name
- and the value is attribute type.
- attribute_map (dict): The key is attribute name
- and the value is json key in definition.
- """
- swagger_types = {
- 'values': 'list[str]',
- 'display_name': 'str',
- 'resource_type': 'str',
- 'node_selector': 'dict(str, str)',
- 'monthly_cost_per_gpu': 'float'
- }
-
- attribute_map = {
- 'values': 'values',
- 'display_name': 'display_name',
- 'resource_type': 'resource_type',
- 'node_selector': 'node_selector',
- 'monthly_cost_per_gpu': 'monthly_cost_per_gpu'
- }
-
- def __init__(self, values=None, display_name=None, resource_type=None, node_selector=None, monthly_cost_per_gpu=None): # noqa: E501
- """GPU - a model defined in Swagger""" # noqa: E501
- self._values = None
- self._display_name = None
- self._resource_type = None
- self._node_selector = None
- self._monthly_cost_per_gpu = None
- self.discriminator = None
- if values is not None:
- self.values = values
- if display_name is not None:
- self.display_name = display_name
- if resource_type is not None:
- self.resource_type = resource_type
- if node_selector is not None:
- self.node_selector = node_selector
- if monthly_cost_per_gpu is not None:
- self.monthly_cost_per_gpu = monthly_cost_per_gpu
-
- @property
- def values(self):
- """Gets the values of this GPU. # noqa: E501
-
-
- :return: The values of this GPU. # noqa: E501
- :rtype: list[str]
- """
- return self._values
-
- @values.setter
- def values(self, values):
- """Sets the values of this GPU.
-
-
- :param values: The values of this GPU. # noqa: E501
- :type: list[str]
- """
-
- self._values = values
-
- @property
- def display_name(self):
- """Gets the display_name of this GPU. # noqa: E501
-
-
- :return: The display_name of this GPU. # noqa: E501
- :rtype: str
- """
- return self._display_name
-
- @display_name.setter
- def display_name(self, display_name):
- """Sets the display_name of this GPU.
-
-
- :param display_name: The display_name of this GPU. # noqa: E501
- :type: str
- """
-
- self._display_name = display_name
-
- @property
- def resource_type(self):
- """Gets the resource_type of this GPU. # noqa: E501
-
-
- :return: The resource_type of this GPU. # noqa: E501
- :rtype: str
- """
- return self._resource_type
-
- @resource_type.setter
- def resource_type(self, resource_type):
- """Sets the resource_type of this GPU.
-
-
- :param resource_type: The resource_type of this GPU. # noqa: E501
- :type: str
- """
-
- self._resource_type = resource_type
-
- @property
- def node_selector(self):
- """Gets the node_selector of this GPU. # noqa: E501
-
-
- :return: The node_selector of this GPU. # noqa: E501
- :rtype: dict(str, str)
- """
- return self._node_selector
-
- @node_selector.setter
- def node_selector(self, node_selector):
- """Sets the node_selector of this GPU.
-
-
- :param node_selector: The node_selector of this GPU. # noqa: E501
- :type: dict(str, str)
- """
-
- self._node_selector = node_selector
-
- @property
- def monthly_cost_per_gpu(self):
- """Gets the monthly_cost_per_gpu of this GPU. # noqa: E501
-
-
- :return: The monthly_cost_per_gpu of this GPU. # noqa: E501
- :rtype: float
- """
- return self._monthly_cost_per_gpu
-
- @monthly_cost_per_gpu.setter
- def monthly_cost_per_gpu(self, monthly_cost_per_gpu):
- """Sets the monthly_cost_per_gpu of this GPU.
-
-
- :param monthly_cost_per_gpu: The monthly_cost_per_gpu of this GPU. # noqa: E501
- :type: float
- """
-
- self._monthly_cost_per_gpu = monthly_cost_per_gpu
-
- def to_dict(self):
- """Returns the model properties as a dict"""
- result = {}
-
- for attr, _ in six.iteritems(self.swagger_types):
- value = getattr(self, attr)
- if isinstance(value, list):
- result[attr] = list(map(
- lambda x: x.to_dict() if hasattr(x, "to_dict") else x,
- value
- ))
- elif hasattr(value, "to_dict"):
- result[attr] = value.to_dict()
- elif isinstance(value, dict):
- result[attr] = dict(map(
- lambda item: (item[0], item[1].to_dict())
- if hasattr(item[1], "to_dict") else item,
- value.items()
- ))
- else:
- result[attr] = value
- if issubclass(GPU, dict):
- for key, value in self.items():
- result[key] = value
-
- return result
-
- def to_str(self):
- """Returns the string representation of the model"""
- return pprint.pformat(self.to_dict())
-
- def __repr__(self):
- """For `print` and `pprint`"""
- return self.to_str()
-
- def __eq__(self, other):
- """Returns true if both objects are equal"""
- if not isinstance(other, GPU):
- return False
-
- return self.__dict__ == other.__dict__
-
- def __ne__(self, other):
- """Returns true if both objects are not equal"""
- return not self == other
diff --git a/python/sdk/client/models/gpu_config.py b/python/sdk/client/models/gpu_config.py
new file mode 100644
index 000000000..a75ecf4ab
--- /dev/null
+++ b/python/sdk/client/models/gpu_config.py
@@ -0,0 +1,266 @@
+# coding: utf-8
+
+"""
+ Merlin
+
+ API Guide for accessing Merlin's model management, deployment, and serving functionalities # noqa: E501
+
+ OpenAPI spec version: 0.14.0
+
+ Generated by: https://github.com/swagger-api/swagger-codegen.git
+"""
+
+import pprint
+import re # noqa: F401
+
+import six
+
+class GPUConfig(object):
+ """NOTE: This class is auto generated by the swagger code generator program.
+
+ Do not edit the class manually.
+ """
+ """
+ Attributes:
+ swagger_types (dict): The key is attribute name
+ and the value is attribute type.
+ attribute_map (dict): The key is attribute name
+ and the value is json key in definition.
+ """
+ swagger_types = {
+ 'name': 'str',
+ 'values': 'list[str]',
+ 'resource_type': 'str',
+ 'node_selector': 'dict(str, str)',
+ 'tolerations': 'list[GPUToleration]',
+ 'min_monthly_cost_per_gpu': 'float',
+ 'max_monthly_cost_per_gpu': 'float'
+ }
+
+ attribute_map = {
+ 'name': 'name',
+ 'values': 'values',
+ 'resource_type': 'resource_type',
+ 'node_selector': 'node_selector',
+ 'tolerations': 'tolerations',
+ 'min_monthly_cost_per_gpu': 'min_monthly_cost_per_gpu',
+ 'max_monthly_cost_per_gpu': 'max_monthly_cost_per_gpu'
+ }
+
+ def __init__(self, name=None, values=None, resource_type=None, node_selector=None, tolerations=None, min_monthly_cost_per_gpu=None, max_monthly_cost_per_gpu=None): # noqa: E501
+ """GPUConfig - a model defined in Swagger""" # noqa: E501
+ self._name = None
+ self._values = None
+ self._resource_type = None
+ self._node_selector = None
+ self._tolerations = None
+ self._min_monthly_cost_per_gpu = None
+ self._max_monthly_cost_per_gpu = None
+ self.discriminator = None
+ if name is not None:
+ self.name = name
+ if values is not None:
+ self.values = values
+ if resource_type is not None:
+ self.resource_type = resource_type
+ if node_selector is not None:
+ self.node_selector = node_selector
+ if tolerations is not None:
+ self.tolerations = tolerations
+ if min_monthly_cost_per_gpu is not None:
+ self.min_monthly_cost_per_gpu = min_monthly_cost_per_gpu
+ if max_monthly_cost_per_gpu is not None:
+ self.max_monthly_cost_per_gpu = max_monthly_cost_per_gpu
+
+ @property
+ def name(self):
+ """Gets the name of this GPUConfig. # noqa: E501
+
+
+ :return: The name of this GPUConfig. # noqa: E501
+ :rtype: str
+ """
+ return self._name
+
+ @name.setter
+ def name(self, name):
+ """Sets the name of this GPUConfig.
+
+
+ :param name: The name of this GPUConfig. # noqa: E501
+ :type: str
+ """
+
+ self._name = name
+
+ @property
+ def values(self):
+ """Gets the values of this GPUConfig. # noqa: E501
+
+
+ :return: The values of this GPUConfig. # noqa: E501
+ :rtype: list[str]
+ """
+ return self._values
+
+ @values.setter
+ def values(self, values):
+ """Sets the values of this GPUConfig.
+
+
+ :param values: The values of this GPUConfig. # noqa: E501
+ :type: list[str]
+ """
+
+ self._values = values
+
+ @property
+ def resource_type(self):
+ """Gets the resource_type of this GPUConfig. # noqa: E501
+
+
+ :return: The resource_type of this GPUConfig. # noqa: E501
+ :rtype: str
+ """
+ return self._resource_type
+
+ @resource_type.setter
+ def resource_type(self, resource_type):
+ """Sets the resource_type of this GPUConfig.
+
+
+ :param resource_type: The resource_type of this GPUConfig. # noqa: E501
+ :type: str
+ """
+
+ self._resource_type = resource_type
+
+ @property
+ def node_selector(self):
+ """Gets the node_selector of this GPUConfig. # noqa: E501
+
+
+ :return: The node_selector of this GPUConfig. # noqa: E501
+ :rtype: dict(str, str)
+ """
+ return self._node_selector
+
+ @node_selector.setter
+ def node_selector(self, node_selector):
+ """Sets the node_selector of this GPUConfig.
+
+
+ :param node_selector: The node_selector of this GPUConfig. # noqa: E501
+ :type: dict(str, str)
+ """
+
+ self._node_selector = node_selector
+
+ @property
+ def tolerations(self):
+ """Gets the tolerations of this GPUConfig. # noqa: E501
+
+
+ :return: The tolerations of this GPUConfig. # noqa: E501
+ :rtype: list[GPUToleration]
+ """
+ return self._tolerations
+
+ @tolerations.setter
+ def tolerations(self, tolerations):
+ """Sets the tolerations of this GPUConfig.
+
+
+ :param tolerations: The tolerations of this GPUConfig. # noqa: E501
+ :type: list[GPUToleration]
+ """
+
+ self._tolerations = tolerations
+
+ @property
+ def min_monthly_cost_per_gpu(self):
+ """Gets the min_monthly_cost_per_gpu of this GPUConfig. # noqa: E501
+
+
+ :return: The min_monthly_cost_per_gpu of this GPUConfig. # noqa: E501
+ :rtype: float
+ """
+ return self._min_monthly_cost_per_gpu
+
+ @min_monthly_cost_per_gpu.setter
+ def min_monthly_cost_per_gpu(self, min_monthly_cost_per_gpu):
+ """Sets the min_monthly_cost_per_gpu of this GPUConfig.
+
+
+ :param min_monthly_cost_per_gpu: The min_monthly_cost_per_gpu of this GPUConfig. # noqa: E501
+ :type: float
+ """
+
+ self._min_monthly_cost_per_gpu = min_monthly_cost_per_gpu
+
+ @property
+ def max_monthly_cost_per_gpu(self):
+ """Gets the max_monthly_cost_per_gpu of this GPUConfig. # noqa: E501
+
+
+ :return: The max_monthly_cost_per_gpu of this GPUConfig. # noqa: E501
+ :rtype: float
+ """
+ return self._max_monthly_cost_per_gpu
+
+ @max_monthly_cost_per_gpu.setter
+ def max_monthly_cost_per_gpu(self, max_monthly_cost_per_gpu):
+ """Sets the max_monthly_cost_per_gpu of this GPUConfig.
+
+
+ :param max_monthly_cost_per_gpu: The max_monthly_cost_per_gpu of this GPUConfig. # noqa: E501
+ :type: float
+ """
+
+ self._max_monthly_cost_per_gpu = max_monthly_cost_per_gpu
+
+ def to_dict(self):
+ """Returns the model properties as a dict"""
+ result = {}
+
+ for attr, _ in six.iteritems(self.swagger_types):
+ value = getattr(self, attr)
+ if isinstance(value, list):
+ result[attr] = list(map(
+ lambda x: x.to_dict() if hasattr(x, "to_dict") else x,
+ value
+ ))
+ elif hasattr(value, "to_dict"):
+ result[attr] = value.to_dict()
+ elif isinstance(value, dict):
+ result[attr] = dict(map(
+ lambda item: (item[0], item[1].to_dict())
+ if hasattr(item[1], "to_dict") else item,
+ value.items()
+ ))
+ else:
+ result[attr] = value
+ if issubclass(GPUConfig, dict):
+ for key, value in self.items():
+ result[key] = value
+
+ return result
+
+ def to_str(self):
+ """Returns the string representation of the model"""
+ return pprint.pformat(self.to_dict())
+
+ def __repr__(self):
+ """For `print` and `pprint`"""
+ return self.to_str()
+
+ def __eq__(self, other):
+ """Returns true if both objects are equal"""
+ if not isinstance(other, GPUConfig):
+ return False
+
+ return self.__dict__ == other.__dict__
+
+ def __ne__(self, other):
+ """Returns true if both objects are not equal"""
+ return not self == other
diff --git a/python/sdk/client/models/gpu_toleration.py b/python/sdk/client/models/gpu_toleration.py
new file mode 100644
index 000000000..69de71a0b
--- /dev/null
+++ b/python/sdk/client/models/gpu_toleration.py
@@ -0,0 +1,214 @@
+# coding: utf-8
+
+"""
+ Merlin
+
+ API Guide for accessing Merlin's model management, deployment, and serving functionalities # noqa: E501
+
+ OpenAPI spec version: 0.14.0
+
+ Generated by: https://github.com/swagger-api/swagger-codegen.git
+"""
+
+import pprint
+import re # noqa: F401
+
+import six
+
+class GPUToleration(object):
+ """NOTE: This class is auto generated by the swagger code generator program.
+
+ Do not edit the class manually.
+ """
+ """
+ Attributes:
+ swagger_types (dict): The key is attribute name
+ and the value is attribute type.
+ attribute_map (dict): The key is attribute name
+ and the value is json key in definition.
+ """
+ swagger_types = {
+ 'key': 'str',
+ 'operator': 'str',
+ 'value': 'str',
+ 'effect': 'str',
+ 'toleration_seconds': 'int'
+ }
+
+ attribute_map = {
+ 'key': 'key',
+ 'operator': 'operator',
+ 'value': 'value',
+ 'effect': 'effect',
+ 'toleration_seconds': 'toleration_seconds'
+ }
+
+ def __init__(self, key=None, operator=None, value=None, effect=None, toleration_seconds=None): # noqa: E501
+ """GPUToleration - a model defined in Swagger""" # noqa: E501
+ self._key = None
+ self._operator = None
+ self._value = None
+ self._effect = None
+ self._toleration_seconds = None
+ self.discriminator = None
+ if key is not None:
+ self.key = key
+ if operator is not None:
+ self.operator = operator
+ if value is not None:
+ self.value = value
+ if effect is not None:
+ self.effect = effect
+ if toleration_seconds is not None:
+ self.toleration_seconds = toleration_seconds
+
+ @property
+ def key(self):
+ """Gets the key of this GPUToleration. # noqa: E501
+
+
+ :return: The key of this GPUToleration. # noqa: E501
+ :rtype: str
+ """
+ return self._key
+
+ @key.setter
+ def key(self, key):
+ """Sets the key of this GPUToleration.
+
+
+ :param key: The key of this GPUToleration. # noqa: E501
+ :type: str
+ """
+
+ self._key = key
+
+ @property
+ def operator(self):
+ """Gets the operator of this GPUToleration. # noqa: E501
+
+
+ :return: The operator of this GPUToleration. # noqa: E501
+ :rtype: str
+ """
+ return self._operator
+
+ @operator.setter
+ def operator(self, operator):
+ """Sets the operator of this GPUToleration.
+
+
+ :param operator: The operator of this GPUToleration. # noqa: E501
+ :type: str
+ """
+
+ self._operator = operator
+
+ @property
+ def value(self):
+ """Gets the value of this GPUToleration. # noqa: E501
+
+
+ :return: The value of this GPUToleration. # noqa: E501
+ :rtype: str
+ """
+ return self._value
+
+ @value.setter
+ def value(self, value):
+ """Sets the value of this GPUToleration.
+
+
+ :param value: The value of this GPUToleration. # noqa: E501
+ :type: str
+ """
+
+ self._value = value
+
+ @property
+ def effect(self):
+ """Gets the effect of this GPUToleration. # noqa: E501
+
+
+ :return: The effect of this GPUToleration. # noqa: E501
+ :rtype: str
+ """
+ return self._effect
+
+ @effect.setter
+ def effect(self, effect):
+ """Sets the effect of this GPUToleration.
+
+
+ :param effect: The effect of this GPUToleration. # noqa: E501
+ :type: str
+ """
+
+ self._effect = effect
+
+ @property
+ def toleration_seconds(self):
+ """Gets the toleration_seconds of this GPUToleration. # noqa: E501
+
+
+ :return: The toleration_seconds of this GPUToleration. # noqa: E501
+ :rtype: int
+ """
+ return self._toleration_seconds
+
+ @toleration_seconds.setter
+ def toleration_seconds(self, toleration_seconds):
+ """Sets the toleration_seconds of this GPUToleration.
+
+
+ :param toleration_seconds: The toleration_seconds of this GPUToleration. # noqa: E501
+ :type: int
+ """
+
+ self._toleration_seconds = toleration_seconds
+
+ def to_dict(self):
+ """Returns the model properties as a dict"""
+ result = {}
+
+ for attr, _ in six.iteritems(self.swagger_types):
+ value = getattr(self, attr)
+ if isinstance(value, list):
+ result[attr] = list(map(
+ lambda x: x.to_dict() if hasattr(x, "to_dict") else x,
+ value
+ ))
+ elif hasattr(value, "to_dict"):
+ result[attr] = value.to_dict()
+ elif isinstance(value, dict):
+ result[attr] = dict(map(
+ lambda item: (item[0], item[1].to_dict())
+ if hasattr(item[1], "to_dict") else item,
+ value.items()
+ ))
+ else:
+ result[attr] = value
+ if issubclass(GPUToleration, dict):
+ for key, value in self.items():
+ result[key] = value
+
+ return result
+
+ def to_str(self):
+ """Returns the string representation of the model"""
+ return pprint.pformat(self.to_dict())
+
+ def __repr__(self):
+ """For `print` and `pprint`"""
+ return self.to_str()
+
+ def __eq__(self, other):
+ """Returns true if both objects are equal"""
+ if not isinstance(other, GPUToleration):
+ return False
+
+ return self.__dict__ == other.__dict__
+
+ def __ne__(self, other):
+ """Returns true if both objects are not equal"""
+ return not self == other
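
As a quick illustration of the regenerated client models, a small sketch that builds a `GPUConfig` with its `GPUToleration` entries, mirroring the "NVIDIA P4" entry from the example environment config in this diff; the constructor keywords follow the generated `__init__` signatures above.

```
from client.models.gpu_config import GPUConfig
from client.models.gpu_toleration import GPUToleration

# Mirrors the "NVIDIA P4" entry from testdata/valid-environment-1.yaml.
p4 = GPUConfig(
    name="NVIDIA P4",
    values=["None", "1", "2"],
    resource_type="nvidia.com/gpu",
    node_selector={"cloud.google.com/gke-accelerator": "nvidia-tesla-p4"},
    tolerations=[
        GPUToleration(key="caraml/nvidia-tesla-p4", operator="Equal",
                      value="enabled", effect="NoSchedule"),
        GPUToleration(key="nvidia.com/gpu", operator="Equal",
                      value="present", effect="NoSchedule"),
    ],
    min_monthly_cost_per_gpu=332.15,
    max_monthly_cost_per_gpu=332.15,
)

# to_dict() serializes nested models via their own to_dict().
print(p4.to_dict())
```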
diff --git a/python/sdk/client/models/resource_request.py b/python/sdk/client/models/resource_request.py
index 9aae0c14f..38480f1ac 100644
--- a/python/sdk/client/models/resource_request.py
+++ b/python/sdk/client/models/resource_request.py
@@ -32,9 +32,8 @@ class ResourceRequest(object):
'max_replica': 'int',
'cpu_request': 'str',
'memory_request': 'str',
- 'gpu_resource_type': 'str',
- 'gpu_request': 'str',
- 'gpu_node_selector': 'dict(str, str)'
+ 'gpu_name': 'str',
+ 'gpu_request': 'str'
}
attribute_map = {
@@ -42,20 +41,18 @@ class ResourceRequest(object):
'max_replica': 'max_replica',
'cpu_request': 'cpu_request',
'memory_request': 'memory_request',
- 'gpu_resource_type': 'gpu_resource_type',
- 'gpu_request': 'gpu_request',
- 'gpu_node_selector': 'gpu_node_selector'
+ 'gpu_name': 'gpu_name',
+ 'gpu_request': 'gpu_request'
}
- def __init__(self, min_replica=None, max_replica=None, cpu_request=None, memory_request=None, gpu_resource_type=None, gpu_request=None, gpu_node_selector=None): # noqa: E501
+ def __init__(self, min_replica=None, max_replica=None, cpu_request=None, memory_request=None, gpu_name=None, gpu_request=None): # noqa: E501
"""ResourceRequest - a model defined in Swagger""" # noqa: E501
self._min_replica = None
self._max_replica = None
self._cpu_request = None
self._memory_request = None
- self._gpu_resource_type = None
+ self._gpu_name = None
self._gpu_request = None
- self._gpu_node_selector = None
self.discriminator = None
if min_replica is not None:
self.min_replica = min_replica
@@ -65,12 +62,10 @@ def __init__(self, min_replica=None, max_replica=None, cpu_request=None, memory_
self.cpu_request = cpu_request
if memory_request is not None:
self.memory_request = memory_request
- if gpu_resource_type is not None:
- self.gpu_resource_type = gpu_resource_type
+ if gpu_name is not None:
+ self.gpu_name = gpu_name
if gpu_request is not None:
self.gpu_request = gpu_request
- if gpu_node_selector is not None:
- self.gpu_node_selector = gpu_node_selector
@property
def min_replica(self):
@@ -157,25 +152,25 @@ def memory_request(self, memory_request):
self._memory_request = memory_request
@property
- def gpu_resource_type(self):
- """Gets the gpu_resource_type of this ResourceRequest. # noqa: E501
+ def gpu_name(self):
+ """Gets the gpu_name of this ResourceRequest. # noqa: E501
- :return: The gpu_resource_type of this ResourceRequest. # noqa: E501
+ :return: The gpu_name of this ResourceRequest. # noqa: E501
:rtype: str
"""
- return self._gpu_resource_type
+ return self._gpu_name
- @gpu_resource_type.setter
- def gpu_resource_type(self, gpu_resource_type):
- """Sets the gpu_resource_type of this ResourceRequest.
+ @gpu_name.setter
+ def gpu_name(self, gpu_name):
+ """Sets the gpu_name of this ResourceRequest.
- :param gpu_resource_type: The gpu_resource_type of this ResourceRequest. # noqa: E501
+ :param gpu_name: The gpu_name of this ResourceRequest. # noqa: E501
:type: str
"""
- self._gpu_resource_type = gpu_resource_type
+ self._gpu_name = gpu_name
@property
def gpu_request(self):
@@ -198,27 +193,6 @@ def gpu_request(self, gpu_request):
self._gpu_request = gpu_request
- @property
- def gpu_node_selector(self):
- """Gets the gpu_node_selector of this ResourceRequest. # noqa: E501
-
-
- :return: The gpu_node_selector of this ResourceRequest. # noqa: E501
- :rtype: dict(str, str)
- """
- return self._gpu_node_selector
-
- @gpu_node_selector.setter
- def gpu_node_selector(self, gpu_node_selector):
- """Sets the gpu_node_selector of this ResourceRequest.
-
-
- :param gpu_node_selector: The gpu_node_selector of this ResourceRequest. # noqa: E501
- :type: dict(str, str)
- """
-
- self._gpu_node_selector = gpu_node_selector
-
def to_dict(self):
"""Returns the model properties as a dict"""
result = {}
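
The regenerated client model above now serializes only gpu_name and gpu_request; the concrete resource_type, node_selector and tolerations are presumably resolved on the server side from the environment's GPU configuration, since the SDK no longer sets them (see merlin/model.py below). A minimal sketch of the trimmed payload, assuming the generated package is importable as `client` as in python/sdk/test/model_test.py below, with purely illustrative values:

    import client

    rr = client.ResourceRequest(
        min_replica=1,
        max_replica=2,
        cpu_request="500m",
        memory_request="512Mi",
        gpu_name="nvidia-tesla-p4",  # must match a GPU name exposed by the target environment
        gpu_request="1",             # must be one of the counts listed for that GPU
    )
    # to_dict() no longer carries gpu_resource_type or gpu_node_selector
    print(rr.to_dict())
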
diff --git a/python/sdk/merlin/model.py b/python/sdk/merlin/model.py
index 4e611bb21..e727d66e8 100644
--- a/python/sdk/merlin/model.py
+++ b/python/sdk/merlin/model.py
@@ -33,6 +33,7 @@
from docker import APIClient
from docker.errors import BuildError
from docker.models.containers import Container
+from merlin import pyfunc
from merlin.autoscaling import (RAW_DEPLOYMENT_DEFAULT_AUTOSCALING_POLICY,
SERVERLESS_DEFAULT_AUTOSCALING_POLICY,
AutoscalingPolicy)
@@ -57,7 +58,6 @@
from mlflow.pyfunc import PythonModel
import mlflow
-from merlin import pyfunc
# Ensure backward compatibility after moving PyFuncModel and PyFuncV2Model to pyfunc.py
# This allows users to do following import statement
@@ -1045,7 +1045,7 @@ def deploy(self, environment_name: str = None,
if resource_request is None:
env_api = EnvironmentApi(self._api_client)
env_list = env_api.environments_get()
-
+
for env in env_list:
if env.name == target_env_name:
resource_request = ResourceRequest(
@@ -1054,7 +1054,7 @@ def deploy(self, environment_name: str = None,
env.default_resource_request.cpu_request,
env.default_resource_request.memory_request,
)
-
+
# This case is when the default resource request is not specified in the environment config
if resource_request is None:
raise ValueError("resource request must be specified")
@@ -1064,20 +1064,19 @@ def deploy(self, environment_name: str = None,
target_resource_request = client.ResourceRequest(
resource_request.min_replica, resource_request.max_replica,
resource_request.cpu_request, resource_request.memory_request)
-
+
if resource_request.gpu_request is not None and resource_request.gpu_name is not None:
env_api = EnvironmentApi(self._api_client)
env_list = env_api.environments_get()
for env in env_list:
for gpu in env.gpus:
- if resource_request.gpu_name == gpu.display_name:
+ if resource_request.gpu_name == gpu.name:
if resource_request.gpu_request not in gpu.values:
raise ValueError(f"Invalid GPU request count. Supported GPUs count for {resource_request.gpu_name} is {gpu.values}")
-
+
+ target_resource_request.gpu_name = resource_request.gpu_name
target_resource_request.gpu_request = resource_request.gpu_request
- target_resource_request.gpu_resource_type = gpu.resource_type
- target_resource_request.gpu_node_selector = gpu.node_selector
break
target_env_vars = []
@@ -1119,12 +1118,23 @@ def deploy(self, environment_name: str = None,
autoscaling_policy.target_value),
protocol=protocol.value
)
-
- endpoint = endpoint_api \
- .models_model_id_versions_version_id_endpoint_post(int(model.id),
- int(self.id),
- body=endpoint.to_dict())
-
+ current_endpoint = self.endpoint
+ if current_endpoint is not None:
+ # This allows a serving deployment to be updated while it is serving
+ if current_endpoint.status == Status.SERVING:
+ endpoint.status = Status.SERVING.value
+ else:
+ endpoint.status = Status.RUNNING.value
+ endpoint = endpoint_api \
+ .models_model_id_versions_version_id_endpoint_endpoint_id_put(int(model.id),
+ int(self.id),
+ current_endpoint.id,
+ body=endpoint.to_dict())
+ else:
+ endpoint = endpoint_api \
+ .models_model_id_versions_version_id_endpoint_post(int(model.id),
+ int(self.id),
+ body=endpoint.to_dict())
bar = pyprind.ProgBar(100, track_time=True,
title=f"Deploying model {model.name} version "
f"{self.id}")
@@ -1139,7 +1149,7 @@ def deploy(self, environment_name: str = None,
sleep(5)
bar.stop()
- if endpoint.status != "running":
+ if endpoint.status != "running" and endpoint.status != "serving":
raise ModelEndpointDeploymentError(model.name, self.id, endpoint.message)
log_url = f"{self.url}/{self.id}/endpoints/{endpoint.id}/logs"
@@ -1147,7 +1157,7 @@ def deploy(self, environment_name: str = None,
f"\nView model version logs: {log_url}")
self._version_endpoints = self.list_endpoint()
-
+
return VersionEndpoint(endpoint, log_url)
def create_transformer_spec(self, transformer: Transformer, target_env_name: str) -> client.Transformer:
@@ -1514,7 +1524,7 @@ def _wait_build_complete(self, logs):
if image_id:
return
raise BuildError('Unknown', logs)
-
+
def delete_model_version(self) -> int:
"""
Delete this model version. Please note that any inactive related entity (endpoints and prediction jobs) will get deleted by this process.
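
From the caller's perspective, the deploy() changes above mean that calling merlin.deploy on a version that already has an endpoint updates that endpoint in place via PUT (and a serving endpoint stays in the serving state) instead of always issuing a POST. A minimal usage sketch, assuming `v` is a model version obtained via merlin.new_model_version() and merlin.log_model(), as in the integration test below:

    import merlin

    # First deployment creates the endpoint (POST)
    endpoint = merlin.deploy(v)

    # A second call redeploys the same endpoint (PUT); here only the
    # autoscaling policy changes, and the endpoint URL is preserved.
    new_endpoint = merlin.deploy(
        v,
        autoscaling_policy=merlin.AutoscalingPolicy(
            metrics_type=merlin.MetricsType.CPU_UTILIZATION, target_value=10
        ),
    )
    assert endpoint.url == new_endpoint.url
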
diff --git a/python/sdk/merlin/resource_request.py b/python/sdk/merlin/resource_request.py
index 52d3df81e..2ab0ce761 100644
--- a/python/sdk/merlin/resource_request.py
+++ b/python/sdk/merlin/resource_request.py
@@ -14,6 +14,7 @@
from typing import Optional
+
class ResourceRequest:
"""
The resource requirement and replicas requests for model version endpoint.
@@ -63,15 +64,15 @@ def memory_request(self, memory_request):
@property
def gpu_request(self) -> Optional[str]:
return self._gpu_request
-
+
@gpu_request.setter
def gpu_request(self, gpu_request):
self._gpu_request = gpu_request
-
+
@property
def gpu_name(self) -> Optional[str]:
return self._gpu_name
-
+
@gpu_name.setter
def gpu_name(self, gpu_name):
self._gpu_name = gpu_name
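
On the user-facing SDK side, a GPU is now requested by name plus count. A minimal sketch, assuming the ResourceRequest constructor accepts gpu_name and gpu_request keywords matching the properties above (the integration test below passes them via **gpu_config), with an illustrative GPU name:

    from merlin.resource_request import ResourceRequest

    resource_request = ResourceRequest(
        1, 2, "500m", "512Mi",
        gpu_name="nvidia-tesla-p4",  # one of the GPU names configured for the environment
        gpu_request="1",             # validated against the allowed counts for that GPU
    )
    # then: merlin.deploy(v, resource_request=resource_request)
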
diff --git a/python/sdk/merlin/version.py b/python/sdk/merlin/version.py
index dd7993990..ff1f7b009 100644
--- a/python/sdk/merlin/version.py
+++ b/python/sdk/merlin/version.py
@@ -12,4 +12,4 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-VERSION = "0.25.0"
+VERSION = "0.0.0"
diff --git a/python/sdk/test/integration_test.py b/python/sdk/test/integration_test.py
index 529e65b09..2cf30ea0e 100644
--- a/python/sdk/test/integration_test.py
+++ b/python/sdk/test/integration_test.py
@@ -18,7 +18,6 @@
import pandas as pd
import pytest
-from merlin.endpoint import Status
from merlin.logger import Logger, LoggerConfig, LoggerMode
from merlin.model import ModelType
from merlin.resource_request import ResourceRequest
@@ -26,7 +25,7 @@
from recursive_diff import recursive_eq
import merlin
-from merlin import DeploymentMode
+from merlin import DeploymentMode, MetricsType
request_json = {"instances": [[2.8, 1.0, 6.8, 0.4], [3.1, 1.4, 4.5, 1.6]]}
tensorflow_request_json = {
@@ -412,13 +411,25 @@ def test_resource_request(
merlin.undeploy(v)
+
@pytest.mark.gpu
@pytest.mark.integration
-@pytest.mark.parametrize("deployment_mode", [DeploymentMode.RAW_DEPLOYMENT, DeploymentMode.SERVERLESS])
-def test_resource_request_with_gpu(integration_test_url, project_name, deployment_mode, use_google_oauth, requests, gpu_config):
+@pytest.mark.parametrize(
+ "deployment_mode", [DeploymentMode.RAW_DEPLOYMENT, DeploymentMode.SERVERLESS]
+)
+def test_resource_request_with_gpu(
+ integration_test_url,
+ project_name,
+ deployment_mode,
+ use_google_oauth,
+ requests,
+ gpu_config,
+):
merlin.set_url(integration_test_url, use_google_oauth=use_google_oauth)
merlin.set_project(project_name)
- merlin.set_model(f"resource-request-with-gpu-{deployment_mode_suffix(deployment_mode)}", ModelType.XGBOOST)
+ merlin.set_model(
+ f"gpu-{deployment_mode_suffix(deployment_mode)}", ModelType.XGBOOST
+ )
model_dir = "test/xgboost-model"
@@ -435,7 +446,10 @@ def test_resource_request_with_gpu(integration_test_url, project_name, deploymen
resource_request = ResourceRequest(1, 1, "100m", "200Mi", **gpu_config)
endpoint = merlin.deploy(
- v, environment_name=default_env.name, resource_request=resource_request, deployment_mode=deployment_mode
+ v,
+ environment_name=default_env.name,
+ resource_request=resource_request,
+ deployment_mode=deployment_mode,
)
resp = requests.post(f"{endpoint.url}", json=request_json)
@@ -1046,5 +1060,63 @@ def test_deployment_mode_for_serving_model(
undeploy_all_version()
+@pytest.mark.integration
+def test_redeploy_model(integration_test_url, project_name, use_google_oauth, requests):
+ """
+ Validate that calling 'merlin.deploy' twice in a row redeploys a Merlin model
+ """
+
+ merlin.set_url(integration_test_url, use_google_oauth=use_google_oauth)
+ merlin.set_project(project_name)
+ merlin.set_model("model-sdk-redeploy", ModelType.TENSORFLOW)
+ model_dir = "test/tensorflow-model"
+
+ undeploy_all_version()
+
+ # Upload new model version: v1
+ with merlin.new_model_version() as v1:
+ merlin.log_model(model_dir=model_dir)
+
+ # Deploy using serverless with RPS autoscaling policy
+ endpoint = merlin.deploy(
+ v1,
+ autoscaling_policy=merlin.AutoscalingPolicy(
+ metrics_type=merlin.MetricsType.RPS, target_value=20
+ ),
+ )
+
+ resp = requests.post(f"{endpoint.url}", json=tensorflow_request_json)
+
+ assert resp.status_code == 200
+ assert resp.json() is not None
+ assert len(resp.json()["predictions"]) == len(tensorflow_request_json["instances"])
+
+ # Check the autoscaling policy of the initial deployment
+ assert endpoint.autoscaling_policy.metrics_type == MetricsType.RPS
+ assert endpoint.autoscaling_policy.target_value == 20
+
+ # Redeploy v1 with a CPU utilization autoscaling policy
+ new_endpoint = merlin.deploy(
+ v1,
+ autoscaling_policy=merlin.AutoscalingPolicy(
+ metrics_type=merlin.MetricsType.CPU_UTILIZATION, target_value=10
+ ),
+ )
+
+ resp = requests.post(f"{new_endpoint.url}", json=tensorflow_request_json)
+
+ assert resp.status_code == 200
+ assert resp.json() is not None
+ assert len(resp.json()["predictions"]) == len(tensorflow_request_json["instances"])
+
+ # Check that the endpoint remains the same
+ assert endpoint.url == new_endpoint.url
+ # Check the autoscaling policy of the redeployed endpoint
+ assert new_endpoint.autoscaling_policy.metrics_type == MetricsType.CPU_UTILIZATION
+ assert new_endpoint.autoscaling_policy.target_value == 10
+
+ undeploy_all_version()
+
+
def deployment_mode_suffix(deployment_mode: DeploymentMode):
return deployment_mode.value.lower()[0:1]
diff --git a/python/sdk/test/model_test.py b/python/sdk/test/model_test.py
index d9c66ed92..c5ac44e17 100644
--- a/python/sdk/test/model_test.py
+++ b/python/sdk/test/model_test.py
@@ -18,11 +18,11 @@
import client
import client as cl
+import merlin
import pytest
-from merlin.autoscaling import (
- RAW_DEPLOYMENT_DEFAULT_AUTOSCALING_POLICY,
- SERVERLESS_DEFAULT_AUTOSCALING_POLICY,
-)
+from merlin import AutoscalingPolicy, DeploymentMode, MetricsType
+from merlin.autoscaling import (RAW_DEPLOYMENT_DEFAULT_AUTOSCALING_POLICY,
+ SERVERLESS_DEFAULT_AUTOSCALING_POLICY)
from merlin.batch.config import PredictionJobConfig, ResultType
from merlin.batch.job import JobStatus
from merlin.batch.sink import BigQuerySink, SaveMode
@@ -32,17 +32,18 @@
from merlin.protocol import Protocol
from urllib3_mock import Responses
-import merlin
-from merlin import AutoscalingPolicy, DeploymentMode, MetricsType
-
responses = Responses("requests.packages.urllib3")
default_resource_request = cl.ResourceRequest(1, 1, "100m", "128Mi")
-gpu = cl.GPU(
+gpu = cl.GPUConfig(
+ name="nvidia-tesla-p4",
values=["1", "4", "8"],
- display_name="nvidia-tesla-p4",
resource_type="nvidia.com/gpu",
node_selector={"cloud.google.com/gke-accelerator": "nvidia-tesla-p4"},
+ tolerations=[
+ cl.GPUToleration(key="caraml/nvidia-tesla-p4", operator="Equal", value="enabled", effect="NoSchedule"),
+ cl.GPUToleration(key="nvidia.com/gpu", operator="Equal", value="present", effect="NoSchedule"),
+ ],
)
env_1 = cl.Environment(
@@ -96,9 +97,8 @@
max_replica=1,
cpu_request="100m",
memory_request="128Mi",
+ gpu_name="nvidia-tesla-p4",
gpu_request="1",
- gpu_resource_type="nvidia.com/gpu",
- gpu_node_selector={"cloud.google.com/gke-accelerator": "nvidia-tesla-p4"},
)
ep5 = cl.VersionEndpoint(
"789",
@@ -346,6 +346,14 @@ def test_deploy(self, version):
status=200,
content_type="application/json",
)
+ # This is the additional check that deploy makes to determine whether any endpoints already exist for this model version
+ responses.add(
+ "GET",
+ "/v1/models/1/versions/1/endpoint",
+ body=json.dumps([]),
+ status=200,
+ content_type="application/json",
+ )
responses.add(
"POST",
"/v1/models/1/versions/1/endpoint",
@@ -381,6 +389,14 @@ def test_deploy_upiv1(self, version):
status=200,
content_type="application/json",
)
+ # This is the additional check that deploy makes to determine whether any endpoints already exist for this model version
+ responses.add(
+ "GET",
+ "/v1/models/1/versions/1/endpoint",
+ body=json.dumps([]),
+ status=200,
+ content_type="application/json",
+ )
responses.add(
"POST",
"/v1/models/1/versions/1/endpoint",
@@ -416,6 +432,14 @@ def test_deploy_using_raw_deployment_mode(self, version):
status=200,
content_type="application/json",
)
+ # This is the additional check that deploy makes to determine whether any endpoints already exist for this model version
+ responses.add(
+ "GET",
+ "/v1/models/1/versions/1/endpoint",
+ body=json.dumps([]),
+ status=200,
+ content_type="application/json",
+ )
responses.add(
"POST",
"/v1/models/1/versions/1/endpoint",
@@ -452,6 +476,13 @@ def test_deploy_with_autoscaling_policy(self, version):
status=200,
content_type="application/json",
)
+ # This is the additional check that deploy makes to determine whether any endpoints already exist for this model version
+ responses.add(
+ "GET", "/v1/models/1/versions/1/endpoint",
+ body=json.dumps([]),
+ status=200,
+ content_type="application/json",
+ )
responses.add(
"POST",
"/v1/models/1/versions/1/endpoint",
@@ -505,6 +536,14 @@ def test_deploy_default_env(self, version):
status=200,
content_type="application/json",
)
+ # This is the additional check that deploy makes to determine whether any endpoints already exist for this model version
+ responses.add(
+ "GET",
+ "/v1/models/1/versions/1/endpoint",
+ body=json.dumps([]),
+ status=200,
+ content_type="application/json",
+ )
responses.add(
"POST",
"/v1/models/1/versions/1/endpoint",
@@ -528,6 +567,52 @@ def test_deploy_default_env(self, version):
assert endpoint.environment.cluster == env_1.cluster
assert endpoint.environment.name == env_1.name
+ @responses.activate
+ def test_redeploy_model(self, version):
+ responses.add(
+ "GET",
+ "/v1/environments",
+ body=json.dumps([env_1.to_dict(), env_2.to_dict()]),
+ status=200,
+ content_type="application/json",
+ )
+ # This is the additional check that deploy makes to determine whether any endpoints already exist for this model version
+ responses.add(
+ "GET",
+ "/v1/models/1/versions/1/endpoint",
+ body=json.dumps([ep3.to_dict()]),
+ status=200,
+ content_type="application/json",
+ )
+ responses.add(
+ "PUT",
+ "/v1/models/1/versions/1/endpoint/1234",
+ body=json.dumps(ep4.to_dict()),
+ status=200,
+ content_type="application/json",
+ )
+ responses.add(
+ "GET",
+ "/v1/models/1/versions/1/endpoint",
+ body=json.dumps([ep4.to_dict()]),
+ status=200,
+ content_type="application/json",
+ )
+
+ # Redeployment (update the autoscaling policy)
+ endpoint = version.deploy(environment_name=env_1.name,
+ autoscaling_policy=AutoscalingPolicy(metrics_type=MetricsType.CPU_UTILIZATION,
+ target_value=10))
+
+ assert endpoint.id == ep4.id
+ assert endpoint.status.value == ep4.status
+ assert endpoint.environment_name == ep4.environment_name
+ assert endpoint.environment.cluster == env_1.cluster
+ assert endpoint.environment.name == env_1.name
+ assert endpoint.deployment_mode == DeploymentMode.SERVERLESS
+ assert endpoint.autoscaling_policy.metrics_type == MetricsType.CPU_UTILIZATION
+ assert endpoint.autoscaling_policy.target_value == 10
+
@responses.activate
def test_deploy_with_gpu(self, version):
responses.add(
@@ -561,16 +646,12 @@ def test_deploy_with_gpu(self, version):
assert endpoint.environment.name == env_3.name
assert endpoint.deployment_mode == DeploymentMode.SERVERLESS
assert (
- endpoint.resource_request.gpu_request
- == resource_request_with_gpu.gpu_request
- )
- assert (
- endpoint.resource_request.gpu_resource_type
- == resource_request_with_gpu.gpu_resource_type
+ endpoint.resource_request.gpu_name
+ == resource_request_with_gpu.gpu_name
)
assert (
- endpoint.resource_request.gpu_node_selector
- == resource_request_with_gpu.gpu_node_selector
+ endpoint.resource_request.gpu_request
+ == resource_request_with_gpu.gpu_request
)
@responses.activate
diff --git a/swagger.yaml b/swagger.yaml
index c7a2dc167..346039576 100644
--- a/swagger.yaml
+++ b/swagger.yaml
@@ -1056,7 +1056,7 @@ definitions:
gpus:
type: "array"
items:
- $ref: "#/definitions/GPU"
+ $ref: "#/definitions/GPUConfig"
created_at:
type: "string"
format: "date-time"
@@ -1436,14 +1436,10 @@ definitions:
type: "string"
memory_request:
type: "string"
- gpu_resource_type:
+ gpu_name:
type: "string"
gpu_request:
type: "string"
- gpu_node_selector:
- type: "object"
- additionalProperties:
- type: "string"
AutoscalingPolicy:
type: "object"
@@ -1538,23 +1534,44 @@ definitions:
items:
$ref: "#/definitions/EnvVar"
- GPU:
+ GPUConfig:
type: "object"
properties:
+ name:
+ type: "string"
values:
type: "array"
items:
type: string
- display_name:
- type: "string"
resource_type:
type: "string"
node_selector:
type: "object"
additionalProperties:
type: "string"
- monthly_cost_per_gpu:
+ tolerations:
+ type: "array"
+ items:
+ $ref: "#/definitions/GPUToleration"
+ min_monthly_cost_per_gpu:
type: "number"
+ max_monthly_cost_per_gpu:
+ type: "number"
+
+ GPUToleration:
+ type: "object"
+ properties:
+ key:
+ type: "string"
+ operator:
+ type: "string"
+ value:
+ type: "string"
+ effect:
+ type: "string"
+ toleration_seconds:
+ type: "integer"
+ format: "int64"
PredictionJobResourceRequest:
type: "object"
diff --git a/ui/src/components/ResourcesConfigTable.js b/ui/src/components/ResourcesConfigTable.js
index 1a0c98b79..cce0d4e9e 100644
--- a/ui/src/components/ResourcesConfigTable.js
+++ b/ui/src/components/ResourcesConfigTable.js
@@ -14,32 +14,53 @@
* limitations under the License.
*/
-import React from "react";
-import PropTypes from "prop-types";
import { EuiDescriptionList } from "@elastic/eui";
+import PropTypes from "prop-types";
+import React from "react";
export const ResourcesConfigTable = ({
- resourceRequest: { cpu_request, memory_request, min_replica, max_replica }
+ resourceRequest: {
+ cpu_request,
+ memory_request,
+ min_replica,
+ max_replica,
+ gpu_name,
+ gpu_request,
+ },
}) => {
const items = [
{
title: "CPU Request",
- description: cpu_request
+ description: cpu_request,
},
{
title: "Memory Request",
- description: memory_request
+ description: memory_request,
},
{
title: "Min Replicas",
- description: min_replica
+ description: min_replica,
},
{
title: "Max Replicas",
- description: max_replica
- }
+ description: max_replica,
+ },
];
+ if (gpu_name !== undefined && gpu_name !== "") {
+ items.push({
+ title: "GPU Name",
+ description: gpu_name,
+ });
+ }
+
+ if (gpu_request !== undefined && gpu_request !== "0") {
+ items.push({
+ title: "GPU Request",
+ description: gpu_request,
+ });
+ }
+
return (