Skip to content

Commit

Permalink
Granular max-in-flight updates (cloudfoundry#4124)
Browse files Browse the repository at this point in the history
* Scale action now compares number of starting/running non-routable instances to max-in-flight, instead of waiting for all instances to become routable
* Scale action does not continue scaling up when any instances are 'unhealthy' (e.g. 'crashed', 'down', etc) as it's difficult to determine if unhealthy instances belong to the 'max in flight' group
* Number of desired nondeploying instances now recalculated each iteration instead of decrementing by 'max-in-flight' without checking if it's in a correct state. This mitigates bugs where deployment trains (continually creating new deployments before the previous had completed) could result in number app instances exceeding the max-in-flight limit.
  • Loading branch information
sethboyles authored Dec 5, 2024
1 parent 7536b72 commit 749e6fc
Show file tree
Hide file tree
Showing 2 changed files with 317 additions and 49 deletions.
80 changes: 67 additions & 13 deletions lib/cloud_controller/deployment_updater/actions/scale.rb
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ module VCAP::CloudController
module DeploymentUpdater
module Actions
class Scale
HEALTHY_STATES = [VCAP::CloudController::Diego::LRP_RUNNING, VCAP::CloudController::Diego::LRP_STARTING].freeze
attr_reader :deployment, :logger, :app

def initialize(deployment, logger)
Expand All @@ -17,14 +18,15 @@ def initialize(deployment, logger)
def call
deployment.db.transaction do
return unless deployment.lock!.state == DeploymentModel::DEPLOYING_STATE
return unless all_instances_routable?

return unless can_scale? || can_downscale?

app.lock!

oldest_web_process_with_instances.lock!
deploying_web_process.lock!

deployment.update(
last_healthy_at: Time.now,
state: DeploymentModel::DEPLOYING_STATE,
status_value: DeploymentModel::ACTIVE_STATUS_VALUE,
status_reason: DeploymentModel::DEPLOYING_STATUS_REASON
Expand All @@ -36,36 +38,88 @@ def call
end

ScaleDownCanceledProcesses.new(deployment).call
ScaleDownOldProcess.new(deployment, oldest_web_process_with_instances, desired_old_instances).call

deploying_web_process.update(instances: desired_new_instances)
scale_down_old_processes if can_downscale?

if can_scale?
deploying_web_process.update(instances: desired_new_instances)
deployment.update(last_healthy_at: Time.now)
end
end
end

private

def desired_old_instances
[(oldest_web_process_with_instances.instances - deployment.max_in_flight), 0].max
def scale_down_old_processes
instances_to_reduce = non_deploying_web_processes.map(&:instances).sum - desired_non_deploying_instances

return if instances_to_reduce <= 0

non_deploying_web_processes.each do |process|
if instances_to_reduce < process.instances
ScaleDownOldProcess.new(deployment, process, process.instances - instances_to_reduce).call
break
end

instances_to_reduce -= process.instances
ScaleDownOldProcess.new(deployment, process, 0).call
end
end

def can_scale?
starting_instances.count < deployment.max_in_flight &&
unhealthy_instances.count == 0 &&
routable_instances.count >= deploying_web_process.instances - deployment.max_in_flight
rescue CloudController::Errors::ApiError # the instances_reporter re-raises InstancesUnavailable as ApiError
logger.info("skipping-deployment-update-for-#{deployment.guid}")
false
end

def can_downscale?
non_deploying_web_processes.map(&:instances).sum > desired_non_deploying_instances
rescue CloudController::Errors::ApiError # the instances_reporter re-raises InstancesUnavailable as ApiError
logger.info("skipping-deployment-update-for-#{deployment.guid}")
false
end

def desired_non_deploying_instances
[deployment.original_web_process_instance_count - routable_instances.count, 0].max
end

def desired_new_instances
[deploying_web_process.instances + deployment.max_in_flight, deployment.original_web_process_instance_count].min
[routable_instances.count + deployment.max_in_flight, deployment.original_web_process_instance_count].min
end

def oldest_web_process_with_instances
@oldest_web_process_with_instances ||= app.web_processes.select { |process| process.instances > 0 }.min_by { |p| [p.created_at, p.id] }
end

def non_deploying_web_processes
app.web_processes.reject { |process| process.guid == deploying_web_process.guid }.sort_by { |p| [p.created_at, p.id] }
end

def deploying_web_process
@deploying_web_process ||= deployment.deploying_web_process
end

def all_instances_routable?
instances = instance_reporters.all_instances_for_app(deployment.deploying_web_process)
instances.all? { |_, val| val[:state] == VCAP::CloudController::Diego::LRP_RUNNING && val[:routable] }
rescue CloudController::Errors::ApiError # the instances_reporter re-raises InstancesUnavailable as ApiError
logger.info("skipping-deployment-update-for-#{deployment.guid}")
false
def starting_instances
healthy_instances.reject { |_, val| val[:state] == VCAP::CloudController::Diego::LRP_RUNNING && val[:routable] }
end

def routable_instances
reported_instances.select { |_, val| val[:state] == VCAP::CloudController::Diego::LRP_RUNNING && val[:routable] }
end

def healthy_instances
reported_instances.select { |_, val| HEALTHY_STATES.include?(val[:state]) }
end

def unhealthy_instances
reported_instances.reject { |_, val| HEALTHY_STATES.include?(val[:state]) }
end

def reported_instances
@reported_instances = instance_reporters.all_instances_for_app(deploying_web_process)
end

def instance_reporters
Expand Down
Loading

0 comments on commit 749e6fc

Please sign in to comment.