From c29b27a1d09455bd18eb1135403e3af91ddf8b86 Mon Sep 17 00:00:00 2001 From: openwebsolns Date: Mon, 12 Dec 2022 07:05:39 -0500 Subject: [PATCH] feat: track alarms in bake times (#3) * feat: track alarms in bake times Experimental feature. * chore: self mutation Signed-off-by: github-actions Signed-off-by: github-actions Co-authored-by: Dayan Paez Co-authored-by: github-actions --- API.md | 78 +++++++++++ src/CodePipelineHelper.ts | 8 +- src/DeploymentSafetyEnforcer.function.ts | 166 ++++++++++++++++++++++- src/DeploymentSafetyEnforcer.ts | 78 +++++++++++ src/common.ts | 34 +++++ test/CodePipelineHelper.test.ts | 10 ++ 6 files changed, 369 insertions(+), 5 deletions(-) diff --git a/API.md b/API.md index 4adc8d0..f324e7e 100644 --- a/API.md +++ b/API.md @@ -339,6 +339,71 @@ The tree node. ## Structs +### BakeStepAlarmProps + +Alarm to inspect in bake step. + +#### Initializer + +```typescript +import { BakeStepAlarmProps } from 'cdk-deployment-constructs' + +const bakeStepAlarmProps: BakeStepAlarmProps = { ... } +``` + +#### Properties + +| **Name** | **Type** | **Description** | +| --- | --- | --- | +| alarm | aws-cdk-lib.aws_cloudwatch.IAlarm | The name of the alarm to monitor. | +| assumeRole | aws-cdk-lib.aws_iam.IRole | Role to assume in order to describe the alarm history. | +| treatMissingAlarm | string | Specify approval behavior if the alarm cannot be described. | + +--- + +##### `alarm`Required + +```typescript +public readonly alarm: IAlarm; +``` + +- *Type:* aws-cdk-lib.aws_cloudwatch.IAlarm + +The name of the alarm to monitor. + +--- + +##### `assumeRole`Optional + +```typescript +public readonly assumeRole: IRole; +``` + +- *Type:* aws-cdk-lib.aws_iam.IRole + +Role to assume in order to describe the alarm history. + +For cross-account support, first create this role in the target account +and add trust policy that trusts the pipeline account to assume it. + +--- + +##### `treatMissingAlarm`Optional + +```typescript +public readonly treatMissingAlarm: string; +``` + +- *Type:* string + +Specify approval behavior if the alarm cannot be described. + +Default: `REJECT`. Set to `IGNORE` if the alarm may not yet be created. +Note that failure to assume the role (if applicable) may also result in a +rejected approval. + +--- + ### BakeStepProps Props for creating a stage/wave bake approval step. @@ -356,6 +421,7 @@ const bakeStepProps: BakeStepProps = { ... } | **Name** | **Type** | **Description** | | --- | --- | --- | | bakeTime | aws-cdk-lib.Duration | How long to wait before approving the step. | +| rejectOnAlarms | BakeStepAlarmProps[] | Optionally watch the given alarm and reject if it fires. | --- @@ -371,6 +437,18 @@ How long to wait before approving the step. --- +##### `rejectOnAlarms`Optional + +```typescript +public readonly rejectOnAlarms: BakeStepAlarmProps[]; +``` + +- *Type:* BakeStepAlarmProps[] + +Optionally watch the given alarm and reject if it fires. + +--- + ### CodePipelineHelperProps Properties for `CodePipelineHelper`. diff --git a/src/CodePipelineHelper.ts b/src/CodePipelineHelper.ts index 244a9a4..4141d6c 100644 --- a/src/CodePipelineHelper.ts +++ b/src/CodePipelineHelper.ts @@ -129,7 +129,13 @@ export class CodePipelineHelper extends Construct { this.bakeSteps[id] = props; return new pipelines.ManualApprovalStep(id, { - comment: `DeploymentSafetyEnforcer/bake for ${props.bakeTime.toHumanString()}`, + comment: `DeploymentSafetyEnforcer/bake for ${ + props.bakeTime.toHumanString() + }${ + props.rejectOnAlarms + ? ` with alarms ${props.rejectOnAlarms.map((s) => s.alarm.alarmName)}` + : '' + }`, }); } diff --git a/src/DeploymentSafetyEnforcer.function.ts b/src/DeploymentSafetyEnforcer.function.ts index 41f4c94..9d1b0a0 100644 --- a/src/DeploymentSafetyEnforcer.function.ts +++ b/src/DeploymentSafetyEnforcer.function.ts @@ -1,6 +1,7 @@ import { Context } from 'aws-lambda'; import * as aws from 'aws-sdk'; import { + BakeStepAlarmSettings, BakeStepSettings, DeploymentSafetySettings, } from './common'; @@ -30,6 +31,7 @@ interface BakeStepApprovalProps { interface BakeStepAction { decision: 'APPROVE' | 'REJECT' | 'CONTINUE' | 'DONE'; actionName: string; + rejectReasons?: string[]; approvalProps?: BakeStepApprovalProps; } @@ -86,6 +88,68 @@ const transitionDisabledByEnforcer = (reason?: string) => { return true; }; +interface AlarmStateQuery { + readonly alarmName: string; + readonly startTime: Date; + readonly treatMissingAlarm: string; +} + +const getAlarmStates = async ( + alarms: AlarmStateQuery[], + cloudwatchClient: aws.CloudWatch, +) => { + let results: aws.CloudWatch.DescribeAlarmsOutput; + try { + results = await cloudwatchClient.describeAlarms({ + AlarmNames: alarms.map((a) => a.alarmName), + AlarmTypes: ['MetricAlarm', 'CompositeAlarm'], + }).promise(); + } catch (err) { + console.log('Received error while describing alarms', err); + results = {}; + } + + const resultsLookup: Record = {}; + [ + ...(results.MetricAlarms ?? []), + ...(results.CompositeAlarms ?? []), + ].forEach((alarm) => resultsLookup[alarm.AlarmName!] = alarm); + + return alarms.map((alarm) => { + if (!(alarm.alarmName in resultsLookup)) { + console.log(`Alarm ${alarm.alarmName}: not found when describing`); + return { + alarm, + state: 'MISSING', + }; + } + + const state = resultsLookup[alarm.alarmName]; + if (state.StateValue !== 'OK') { + console.log(`Alarm ${alarm.alarmName}: currently in state ${state.StateValue}`); + return { + alarm, + state: 'IN_ALARM', + }; + } + + if (state.StateUpdatedTimestamp && state.StateUpdatedTimestamp > alarm.startTime) { + // while it has recovered, it went into alarm since the start of the action + console.log(`Alarm ${alarm.alarmName}: transitioned on ${state.StateUpdatedTimestamp} > bake start ${alarm.startTime}`); + return { + alarm, + state: 'IN_ALARM', + }; + } + + console.log(`Alarm ${alarm.alarmName}: OK`); + return { + alarm, + state: 'OK', + }; + }); +}; + export const calculatePipelineTransitionActions = ( pipelineState: aws.CodePipeline.GetPipelineStateOutput, calendarsByPipelineStage: Record, @@ -218,6 +282,11 @@ const calculateBakeActions = async ( })); // 3. decide fate of in progress based on start times + const pendingAlarmDecision: { + readonly approvalProps: BakeStepApprovalProps; + readonly alarmSettings: BakeStepAlarmSettings[]; + }[] = []; + const now = Date.now(); await Promise.all( Object.values(inProgress).flatMap((props) => props).map(async (props) => { @@ -231,15 +300,90 @@ const calculateBakeActions = async ( actionName: props.actionName, approvalProps: props, }); + return; } - decisions.push({ - decision: 'CONTINUE', - actionName: props.actionName, + if ((bakeStep.alarmSettings ?? []).length > 0) { + pendingAlarmDecision.push({ + approvalProps: props, + alarmSettings: bakeStep.alarmSettings!, + }); + } else { + decisions.push({ + decision: 'CONTINUE', + actionName: props.actionName, + }); + } + }), + ); + + // 4. check for alarm decisions next + // For efficiency, group the alarm ARNs by '\n', using special + // placeholder value for those with no roles, so that we can use a single SDK + // client and call for all alarms in that group. + const NO_ROLE_PLACEHOLDER = 'NO-ROLE'; + const alarmNamesByRoleArn: Record = {}; + pendingAlarmDecision.forEach(({ approvalProps, alarmSettings }) => { + alarmSettings.forEach(({ alarmName, assumeRoleArn, treatMissingAlarm, region }) => { + const key = `${region}\n${assumeRoleArn ?? NO_ROLE_PLACEHOLDER}`; + if (!(key in alarmNamesByRoleArn)) { + alarmNamesByRoleArn[key] = []; + } + alarmNamesByRoleArn[key].push({ + alarmName, + treatMissingAlarm: treatMissingAlarm ?? 'REJECT', + startTime: startTimes[approvalProps.actionName], + }); + }); + }); + + const failedAlarmsByName = new Set(); + await Promise.all( + Object.entries(alarmNamesByRoleArn).map(async ([roleArnKey, alarms]) => { + const [region, roleArn] = roleArnKey.split('\n'); + let credentials: aws.Credentials | undefined; + if (roleArn !== NO_ROLE_PLACEHOLDER) { + credentials = new aws.ChainableTemporaryCredentials({ + params: { + RoleArn: roleArn, + RoleSessionName: 'DeploymentSafetyEnforcer', + }, + }); + + } + const cloudwatchClient = new aws.CloudWatch({ + credentials, + region, + }); + + const results = await getAlarmStates(alarms, cloudwatchClient); + results.forEach(({ alarm, state }) => { + if (state === 'IN_ALARM') { + failedAlarmsByName.add(alarm.alarmName); + } else if (state === 'MISSING' && alarm.treatMissingAlarm === 'REJECT') { + failedAlarmsByName.add(alarm.alarmName); + } }); }), ); + pendingAlarmDecision.forEach(({ approvalProps, alarmSettings }) => { + const failedAlarms = alarmSettings.filter((alarm) => failedAlarmsByName.has(alarm.alarmName)); + if (failedAlarms.length > 0) { + decisions.push({ + approvalProps, + decision: 'REJECT', + actionName: approvalProps.actionName, + rejectReasons: failedAlarms.map((a) => a.alarmName), + }); + } else { + decisions.push({ + decision: 'CONTINUE', + actionName: approvalProps.actionName, + }); + } + }); + return decisions; }; @@ -281,7 +425,21 @@ const execute = async ( ), ); - // second: approve all bake times + // second: reject/approve all bake times + await Promise.all( + bakeActions.filter((a) => a.decision === 'REJECT').map(({ approvalProps, rejectReasons }) => + codepipeline.putApprovalResult({ + actionName: approvalProps!.actionName!, + pipelineName: approvalProps!.pipelineName!, + stageName: approvalProps!.stageName!, + token: approvalProps!.token!, + result: { + status: 'Rejected', + summary: `DeploymentSafetyEnforcer@${requestId} due to ${rejectReasons?.join(', ')}`, + }, + }).promise(), + ), + ); await Promise.all( bakeActions.filter((a) => a.decision === 'APPROVE').map(({ approvalProps }) => codepipeline.putApprovalResult({ diff --git a/src/DeploymentSafetyEnforcer.ts b/src/DeploymentSafetyEnforcer.ts index 7a24f82..9c0103f 100644 --- a/src/DeploymentSafetyEnforcer.ts +++ b/src/DeploymentSafetyEnforcer.ts @@ -1,4 +1,5 @@ import * as cdk from 'aws-cdk-lib'; +import * as cloudwatch from 'aws-cdk-lib/aws-cloudwatch'; import * as codepipeline from 'aws-cdk-lib/aws-codepipeline'; import * as events from 'aws-cdk-lib/aws-events'; import * as targets from 'aws-cdk-lib/aws-events-targets'; @@ -8,6 +9,34 @@ import * as lambdaNode from 'aws-cdk-lib/aws-lambda-nodejs'; import { Construct } from 'constructs'; import * as common from './common'; +/** + * Alarm to inspect in bake step. + */ +export interface BakeStepAlarmProps { + /** + * The name of the alarm to monitor. + */ + readonly alarm: cloudwatch.IAlarm; + + /** + * Role to assume in order to describe the alarm history. + * + * For cross-account support, first create this role in the target account + * and add trust policy that trusts the pipeline account to assume it. + */ + readonly assumeRole?: iam.IRole; + + /** + * Specify approval behavior if the alarm cannot be described. + * + * Default: `REJECT`. Set to `IGNORE` if the alarm may not yet be created. + * Note that failure to assume the role (if applicable) may also result in a + * rejected approval. + */ + readonly treatMissingAlarm?: 'IGNORE' | 'REJECT'; + +} + /** * Props for creating a stage/wave bake approval step. */ @@ -16,6 +45,11 @@ export interface BakeStepProps { * How long to wait before approving the step. */ readonly bakeTime: cdk.Duration; + + /** + * Optionally watch the given alarm and reject if it fires. + */ + readonly rejectOnAlarms?: BakeStepAlarmProps[]; } /** @@ -118,12 +152,56 @@ export class DeploymentSafetyEnforcer extends Construct { ], }), ); + + const alarmHistoryRoles = new Set(); + const alarmArns = new Set(); + Object.values(props.bakeSteps!) + .flatMap((s) => s.rejectOnAlarms ?? []) + .forEach((alarm) => { + if (alarm.assumeRole) { + alarmHistoryRoles.add(alarm.assumeRole.roleArn); + } else { + // only need explicit alarm permission if role is not provided, + // as permissions are otherwise conferred by the role itself + alarmArns.add(alarm.alarm.alarmArn); + } + }); + + if (alarmHistoryRoles.size > 0) { + enforcerFunction.addToRolePolicy( + new iam.PolicyStatement({ + effect: iam.Effect.ALLOW, + actions: [ + 'sts:AssumeRole', + ], + resources: new Array(...alarmHistoryRoles), + }), + ); + } + + if (alarmArns.size > 0) { + enforcerFunction.addToRolePolicy( + new iam.PolicyStatement({ + effect: iam.Effect.ALLOW, + actions: [ + 'cloudwatch:DescribeAlarms', + ], + resources: ['*'], // IAM requires this level for given operation + }), + ); + } } const bakeStepSettings: Record = {}; Object.entries(bakeSteps).forEach(([actionName, settings]) => { bakeStepSettings[actionName] = { bakeTimeMillis: settings.bakeTime.toMilliseconds(), + alarmSettings: (settings.rejectOnAlarms ?? []).map((s) => ({ + alarmName: s.alarm.alarmName, + region: s.alarm.alarmArn.split(':')[3], + assumeRoleArn: s.assumeRole?.roleArn, + treatMissingAlarm: s.treatMissingAlarm ?? 'REJECT', + })), }; }); diff --git a/src/common.ts b/src/common.ts index 83bfba4..11f013d 100644 --- a/src/common.ts +++ b/src/common.ts @@ -1,3 +1,32 @@ +export interface BakeStepAlarmSettings { + /** + * The name of the alarm to monitor. + */ + readonly alarmName: string; + + /** + * The region for the alarm. + */ + readonly region: string; + + /** + * Role to assume in order to describe the alarm history. + * + * For cross-account support, first create this role in the target account + * and add trust policy that trusts the pipeline account to assume it. + */ + readonly assumeRoleArn?: string; + + /** + * Specify approval behavior if the alarm cannot be described. + * + * Default: `REJECT`. Set to `IGNORE` if the alarm may not yet be created. + * Note that failure to assume the role (if applicable) may also result in a + * rejected approval. + */ + readonly treatMissingAlarm?: 'IGNORE' | 'REJECT'; +} + /** * Settings for `CodePipelineHelper.newBakeStep`. */ @@ -6,6 +35,11 @@ export interface BakeStepSettings { * How long to wait before approving the step. */ readonly bakeTimeMillis: number; + + /** + * Optionally watch the given alarm and reject if it fires. + */ + readonly alarmSettings?: BakeStepAlarmSettings[]; } export interface DeploymentSafetySettings { diff --git a/test/CodePipelineHelper.test.ts b/test/CodePipelineHelper.test.ts index af92a7d..4a170c6 100644 --- a/test/CodePipelineHelper.test.ts +++ b/test/CodePipelineHelper.test.ts @@ -1,5 +1,6 @@ import { App, Duration, Stack, Stage } from 'aws-cdk-lib'; import { Match, Template } from 'aws-cdk-lib/assertions'; +import { Alarm } from 'aws-cdk-lib/aws-cloudwatch'; import { CodePipeline, CodePipelineSource, ShellStep } from 'aws-cdk-lib/pipelines'; import { Construct } from 'constructs'; import { CodePipelineHelper } from '../src/index'; @@ -28,6 +29,15 @@ wave1.addStage(new MockStage(stack, 'FirstStage'), { post: [ tester.newBakeStep('Bake-FirstStage', { bakeTime: Duration.hours(2), + rejectOnAlarms: [ + { + alarm: Alarm.fromAlarmArn( + stack, + 'RollbackAlarm', + 'arn:aws:cloudwatch:us-west-2:000011112222:alarm:Rollback', + ), + }, + ], }), ], });