Granular max-in-flight updates (cloudfoundry#4124)

* Scale action now compares number of starting/running non-routable instances to max-in-flight, instead of waiting for all instances to become routable * Scale action does not continue scaling up when any instances are 'unhealthy' (e.g. 'crashed', 'down', etc) as it's difficult to determine if unhealthy instances belong to the 'max in flight' group * Number of desired nondeploying instances now recalculated each iteration instead of decrementing by 'max-in-flight' without checking if it's in a correct state. This mitigates bugs where deployment trains (continually creating new deployments before the previous had completed) could result in number app instances exceeding the max-in-flight limit.
sap-contributions · Dec 5, 2024 · 749e6fc · 749e6fc
1 parent 7536b72
commit 749e6fc
Show file tree

Hide file tree

Showing 2 changed files with 317 additions and 49 deletions.
diff --git a/lib/cloud_controller/deployment_updater/actions/scale.rb b/lib/cloud_controller/deployment_updater/actions/scale.rb
@@ -6,6 +6,7 @@ module VCAP::CloudController
   module DeploymentUpdater
     module Actions
       class Scale
+        HEALTHY_STATES = [VCAP::CloudController::Diego::LRP_RUNNING, VCAP::CloudController::Diego::LRP_STARTING].freeze
         attr_reader :deployment, :logger, :app
 
         def initialize(deployment, logger)
@@ -17,14 +18,15 @@ def initialize(deployment, logger)
         def call
           deployment.db.transaction do
             return unless deployment.lock!.state == DeploymentModel::DEPLOYING_STATE
-            return unless all_instances_routable?
+
+            return unless can_scale? || can_downscale?
 
             app.lock!
+
             oldest_web_process_with_instances.lock!
             deploying_web_process.lock!
 
             deployment.update(
-              last_healthy_at: Time.now,
               state: DeploymentModel::DEPLOYING_STATE,
               status_value: DeploymentModel::ACTIVE_STATUS_VALUE,
               status_reason: DeploymentModel::DEPLOYING_STATUS_REASON
@@ -36,36 +38,88 @@ def call
             end
 
             ScaleDownCanceledProcesses.new(deployment).call
-            ScaleDownOldProcess.new(deployment, oldest_web_process_with_instances, desired_old_instances).call
 
-            deploying_web_process.update(instances: desired_new_instances)
+            scale_down_old_processes if can_downscale?
+
+            if can_scale?
+              deploying_web_process.update(instances: desired_new_instances)
+              deployment.update(last_healthy_at: Time.now)
+            end
           end
         end
 
         private
 
-        def desired_old_instances
-          [(oldest_web_process_with_instances.instances - deployment.max_in_flight), 0].max
+        def scale_down_old_processes
+          instances_to_reduce = non_deploying_web_processes.map(&:instances).sum - desired_non_deploying_instances
+
+          return if instances_to_reduce <= 0
+
+          non_deploying_web_processes.each do |process|
+            if instances_to_reduce < process.instances
+              ScaleDownOldProcess.new(deployment, process, process.instances - instances_to_reduce).call
+              break
+            end
+
+            instances_to_reduce -= process.instances
+            ScaleDownOldProcess.new(deployment, process, 0).call
+          end
+        end
+
+        def can_scale?
+          starting_instances.count < deployment.max_in_flight &&
+            unhealthy_instances.count == 0 &&
+            routable_instances.count >= deploying_web_process.instances - deployment.max_in_flight
+        rescue CloudController::Errors::ApiError # the instances_reporter re-raises InstancesUnavailable as ApiError
+          logger.info("skipping-deployment-update-for-#{deployment.guid}")
+          false
+        end
+
+        def can_downscale?
+          non_deploying_web_processes.map(&:instances).sum > desired_non_deploying_instances
+        rescue CloudController::Errors::ApiError # the instances_reporter re-raises InstancesUnavailable as ApiError
+          logger.info("skipping-deployment-update-for-#{deployment.guid}")
+          false
+        end
+
+        def desired_non_deploying_instances
+          [deployment.original_web_process_instance_count - routable_instances.count, 0].max
         end
 
         def desired_new_instances
-          [deploying_web_process.instances + deployment.max_in_flight, deployment.original_web_process_instance_count].min
+          [routable_instances.count + deployment.max_in_flight, deployment.original_web_process_instance_count].min
         end
 
         def oldest_web_process_with_instances
           @oldest_web_process_with_instances ||= app.web_processes.select { |process| process.instances > 0 }.min_by { |p| [p.created_at, p.id] }
         end
 
+        def non_deploying_web_processes
+          app.web_processes.reject { |process| process.guid == deploying_web_process.guid }.sort_by { |p| [p.created_at, p.id] }
+        end
+
         def deploying_web_process
           @deploying_web_process ||= deployment.deploying_web_process
         end
 
-        def all_instances_routable?
-          instances = instance_reporters.all_instances_for_app(deployment.deploying_web_process)
-          instances.all? { |_, val| val[:state] == VCAP::CloudController::Diego::LRP_RUNNING && val[:routable] }
-        rescue CloudController::Errors::ApiError # the instances_reporter re-raises InstancesUnavailable as ApiError
-          logger.info("skipping-deployment-update-for-#{deployment.guid}")
-          false
+        def starting_instances
+          healthy_instances.reject { |_, val| val[:state] == VCAP::CloudController::Diego::LRP_RUNNING && val[:routable] }
+        end
+
+        def routable_instances
+          reported_instances.select { |_, val| val[:state] == VCAP::CloudController::Diego::LRP_RUNNING && val[:routable] }
+        end
+
+        def healthy_instances
+          reported_instances.select { |_, val| HEALTHY_STATES.include?(val[:state]) }
+        end
+
+        def unhealthy_instances
+          reported_instances.reject { |_, val| HEALTHY_STATES.include?(val[:state]) }
+        end
+
+        def reported_instances
+          @reported_instances = instance_reporters.all_instances_for_app(deploying_web_process)
         end
 
         def instance_reporters