diff --git a/github-runners/terraform/autoscaling/gh-runners/ephemeral-linux/main.tf b/github-runners/terraform/autoscaling/gh-runners/ephemeral-linux/main.tf index 4c4ce2f4..8c6ea8c6 100644 --- a/github-runners/terraform/autoscaling/gh-runners/ephemeral-linux/main.tf +++ b/github-runners/terraform/autoscaling/gh-runners/ephemeral-linux/main.tf @@ -44,7 +44,7 @@ module "runners" { enable_ssm_on_runners = true # Let the module manage the service linked role - # create_service_linked_role_spot = true + create_service_linked_role_spot = true instance_types = ["m6a.4xlarge", "c6a.4xlarge"] @@ -60,20 +60,19 @@ module "runners" { enable_ephemeral_runners = true # # Example of simple pool usages - # pool_runner_owner = "subspace-scale-runners" - # pool_config = [{ - # size = 3 - # schedule_expression = "cron(* * * * ? *)" - # }] - # - # + pool_runner_owner = "subspace-scale-runners" + pool_config = [{ + size = 3 + schedule_expression = "cron(* * * * ? *)" + }] + enable_job_queued_check = true - # tracing_config = { - # mode = "Active" - # capture_error = true - # capture_http_requests = true - # } + tracing_config = { + mode = "Active" + capture_error = true + capture_http_requests = true + } # AMI selection and userdata diff --git a/github-runners/terraform/autoscaling/gh-runners/ephemeral-windows/main.tf b/github-runners/terraform/autoscaling/gh-runners/ephemeral-windows/main.tf index b3bcb397..d2805522 100644 --- a/github-runners/terraform/autoscaling/gh-runners/ephemeral-windows/main.tf +++ b/github-runners/terraform/autoscaling/gh-runners/ephemeral-windows/main.tf @@ -44,7 +44,7 @@ module "runners" { enable_ssm_on_runners = true # Let the module manage the service linked role - # create_service_linked_role_spot = true + create_service_linked_role_spot = true instance_types = ["m6a.4xlarge", "c6a.4xlarge"] @@ -60,20 +60,19 @@ module "runners" { enable_ephemeral_runners = true # # Example of simple pool usages - # pool_runner_owner = "subspace-scale-runners" - # 
pool_config = [{ - # size = 3 - # schedule_expression = "cron(* * * * ? *)" - # }] - # - # + pool_runner_owner = "subspace-scale-runners" + pool_config = [{ + size = 3 + schedule_expression = "cron(* * * * ? *)" + }] + enable_job_queued_check = true - # tracing_config = { - # mode = "Active" - # capture_error = true - # capture_http_requests = true - # } + tracing_config = { + mode = "Active" + capture_error = true + capture_http_requests = true + } # configure your pre-built AMI diff --git a/github-runners/terraform/autoscaling/gh-runners/multi-runner/main.tf b/github-runners/terraform/autoscaling/gh-runners/multi-runner/main.tf index 6b3dfa4f..9dce6304 100644 --- a/github-runners/terraform/autoscaling/gh-runners/multi-runner/main.tf +++ b/github-runners/terraform/autoscaling/gh-runners/multi-runner/main.tf @@ -41,28 +41,28 @@ module "runners" { source = "../../modules/multi-runner" multi_runner_config = local.multi_runner_config # Alternative to loading runner configuration from Yaml files is using static configuration: - multi_runner_config = { - "linux-x64" = { - matcherConfig : { - labelMatchers = [["self-hosted", "linux", "x64", "ubuntu-2204"], ["self-hosted", "linux", "x64", "ubuntu-latest"]] - exactMatch = true - } - fifo = true - delay_webhook_event = 0 - runner_config = { - runner_os = "linux" - runner_architecture = "x64" - runner_name_prefix = "ubuntu-x64_" - create_service_linked_role_spot = true - enable_ssm_on_runners = true - instance_types = ["m6a.2xlarge", "m6a.4xlarge"] - runner_extra_labels = ["ubuntu-22.04", "ubuntu"] - runners_maximum_count = 3 - enable_ephemeral_runners = true - scale_down_schedule_expression = "cron(* * * * ? 
*)" - } - } - } + # multi_runner_config = { + # "linux-x64" = { + # matcherConfig : { + # labelMatchers = [["self-hosted", "linux", "x64", "ubuntu-2204"], ["self-hosted", "linux", "x64", "ubuntu-latest"]] + # exactMatch = true + # } + # fifo = true + # delay_webhook_event = 0 + # runner_config = { + # runner_os = "linux" + # runner_architecture = "x64" + # runner_name_prefix = "ubuntu-x64_" + # create_service_linked_role_spot = true + # enable_ssm_on_runners = true + # instance_types = ["m6a.2xlarge", "m6a.4xlarge"] + # runner_extra_labels = ["ubuntu-22.04", "ubuntu"] + # runners_maximum_count = 3 + # enable_ephemeral_runners = true + # scale_down_schedule_expression = "cron(* * * * ? *)" + # } + # } + # } aws_region = local.aws_region vpc_id = module.base.vpc.vpc_id subnet_ids = module.base.vpc.private_subnets @@ -78,35 +78,35 @@ module "runners" { webhook_secret = random_id.random.hex } # enable this section for tracing - # tracing_config = { - # mode = "Active" - # capture_error = true - # capture_http_requests = true - # } + tracing_config = { + mode = "Active" + capture_error = true + capture_http_requests = true + } # Assuming local build lambda's to use pre build ones, uncomment the lines below and download the # lambda zip files lambda_download - # webhook_lambda_zip = "../lambdas-download/webhook.zip" - # runner_binaries_syncer_lambda_zip = "../lambdas-download/runner-binaries-syncer.zip" - # runners_lambda_zip = "../lambdas-download/runners.zip" + webhook_lambda_zip = "../lambdas-download/webhook.zip" + runner_binaries_syncer_lambda_zip = "../lambdas-download/runner-binaries-syncer.zip" + runners_lambda_zip = "../lambdas-download/runners.zip" - # enable_workflow_job_events_queue = true + enable_workflow_job_events_queue = true # override delay of events in seconds # Enable debug logging for the lambda functions - # log_level = "debug" + log_level = "debug" - # Enable spot termination watcher + # Do not enable: spot_instance_termination_watcher is not a declared input of ../../modules/multi-runner (see its README Inputs); use instance_termination_watcher below # spot_instance_termination_watcher = { # enable = true # } 
# Enable to track the spot instance termination warning - # instance_termination_watcher = { - # enable = true - # enable_metric = { - # spot_warning = true - # } - # } + instance_termination_watcher = { + enable = true + enable_metric = { + spot_warning = true + } + } } module "webhook_github_app" { diff --git a/github-runners/terraform/autoscaling/modules/multi-runner/README.md b/github-runners/terraform/autoscaling/modules/multi-runner/README.md new file mode 100644 index 00000000..09517e03 --- /dev/null +++ b/github-runners/terraform/autoscaling/modules/multi-runner/README.md @@ -0,0 +1,199 @@ +# Module - Multi runner + +> This module replaces the top-level module to make it easy to create with one deployment multiple type of runners. + +This module creates many runners with a single GitHub app. The module utilizes the internal modules and deploys parts of the stack for each runner defined. + +The module takes a configuration as input containing a matcher for the labels. The [webhook](https://philips-labs.github.io/terraform-aws-github-runner/modules/internal/webhook/) lambda is using the configuration to delegate events based on the labels in the workflow job and sent them to a dedicated queue based on the configuration. Events on each queue are processed by a dedicated lambda per configuration to scale runners. + +For each configuration: + +- When enabled, the [distribution syncer](https://philips-labs.github.io/terraform-aws-github-runner/modules/internal/runner-binaries-syncer/) is deployed for each unique combination of OS and architecture. +- For each configuration a queue is created and [runner module](https://philips-labs.github.io/terraform-aws-github-runner/modules/internal/runners/) is deployed + +## Matching + +Matching of the configuration is done based on the labels specified in labelMatchers configuration. 
The webhook processes the `workflow_job` event and matches the labels against the labels specified in the labelMatchers configuration, in the order of configurations with exact-match true first, followed by all configurations with exact-match false. + +## The catch + +Controlling which event is taken up by which runner is not up to this module. It is completely done by GitHub. This means when potentially different runners can run the same job there is nothing that can be done to guarantee a certain runner will take up the job. + +An example: given you have two runners, one with the labels `self-hosted, linux, x64, large` and one with the labels `self-hosted, linux, x64, small`. Once you define a subset of the labels in the workflow, for example `self-hosted, linux, x64`, both runners can potentially take the job. You can define to scale one of the runners for the event, but still there is no guarantee that the scaled runner takes the job. The workflow with a subset of labels (`self-hosted, linux, x64`) can take up the runner with specific labels (`self-hosted, linux, x64, large`) and leave the workflow with labels (`self-hosted, linux, x64, large`) without a runner. +The only mitigation that is available right now is to use a small pool of runners. Pool instances can also exist for a short amount of time and are only created once in x time based on a cron expression. + +Jobs not defining all labels but for example only `[self-hosted, linux]` could be matched to potentially different runners. The matcher scales the first runner that matches. With the attribute `priority` the order of matchers can be defined. + +## Usages + +A complete example is available in the examples, see the [multi-runner example](https://philips-labs.github.io/terraform-aws-github-runner/examples/) for actual implementation. 
+ +```hcl + +module "multi-runner" { + prefix = "multi-runner" + + github_app = { + # app details + } + + multi_runner_config = { + "linux-arm" = { + matcherConfig : { + labelMatchers = [["self-hosted", "linux", "arm64", "arm"]] + exactMatch = true + } + runner_config = { + runner_os = "linux" + runner_architecture = "arm64" + runner_extra_labels = ["arm"] + enable_ssm_on_runners = true + instance_types = ["t4g.large", "c6g.large"] + ... + } + ... + }, + "linux-x64" = { + matcherConfig : { + labelMatchers = [["self-hosted", "linux", "x64"]] + exactMatch = false + } + runner_config = { + runner_os = "linux" + runner_architecture = "x64" + instance_types = ["m5ad.large", "m5a.large"] + enable_ephemeral_runners = true + delay_webhook_event = 0 + ... + } + ... + } + } + +} +``` + + +## Requirements + +| Name | Version | +|------|---------| +| [terraform](#requirement\_terraform) | >= 1.3 | +| [aws](#requirement\_aws) | ~> 5.27 | +| [random](#requirement\_random) | ~> 3.0 | + +## Providers + +| Name | Version | +|------|---------| +| [aws](#provider\_aws) | ~> 5.27 | +| [random](#provider\_random) | ~> 3.0 | + +## Modules + +| Name | Source | Version | +|------|--------|---------| +| [ami\_housekeeper](#module\_ami\_housekeeper) | ../ami-housekeeper | n/a | +| [instance\_termination\_watcher](#module\_instance\_termination\_watcher) | ../termination-watcher | n/a | +| [runner\_binaries](#module\_runner\_binaries) | ../runner-binaries-syncer | n/a | +| [runners](#module\_runners) | ../runners | n/a | +| [ssm](#module\_ssm) | ../ssm | n/a | +| [webhook](#module\_webhook) | ../webhook | n/a | + +## Resources + +| Name | Type | +|------|------| +| [aws_sqs_queue.queued_builds](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/sqs_queue) | resource | +| [aws_sqs_queue.queued_builds_dlq](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/sqs_queue) | resource | +| 
[aws_sqs_queue.webhook_events_workflow_job_queue](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/sqs_queue) | resource | +| [aws_sqs_queue_policy.build_queue_dlq_policy](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/sqs_queue_policy) | resource | +| [aws_sqs_queue_policy.build_queue_policy](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/sqs_queue_policy) | resource | +| [random_string.random](https://registry.terraform.io/providers/hashicorp/random/latest/docs/resources/string) | resource | +| [aws_iam_policy_document.deny_unsecure_transport](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/iam_policy_document) | data source | + +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|------|---------|:--------:| +| [ami\_housekeeper\_cleanup\_config](#input\_ami\_housekeeper\_cleanup\_config) | Configuration for AMI cleanup. |
object({| `{}` | no | +| [ami\_housekeeper\_lambda\_memory\_size](#input\_ami\_housekeeper\_lambda\_memory\_size) | Memory size linit in MB of the lambda. | `number` | `256` | no | +| [ami\_housekeeper\_lambda\_s3\_key](#input\_ami\_housekeeper\_lambda\_s3\_key) | S3 key for syncer lambda function. Required if using S3 bucket to specify lambdas. | `string` | `null` | no | +| [ami\_housekeeper\_lambda\_s3\_object\_version](#input\_ami\_housekeeper\_lambda\_s3\_object\_version) | S3 object version for syncer lambda function. Useful if S3 versioning is enabled on source bucket. | `string` | `null` | no | +| [ami\_housekeeper\_lambda\_schedule\_expression](#input\_ami\_housekeeper\_lambda\_schedule\_expression) | Scheduler expression for action runner binary syncer. | `string` | `"cron(11 7 * * ? *)"` | no | +| [ami\_housekeeper\_lambda\_timeout](#input\_ami\_housekeeper\_lambda\_timeout) | Time out of the lambda in seconds. | `number` | `300` | no | +| [ami\_housekeeper\_lambda\_zip](#input\_ami\_housekeeper\_lambda\_zip) | File location of the lambda zip file. | `string` | `null` | no | +| [associate\_public\_ipv4\_address](#input\_associate\_public\_ipv4\_address) | Associate public IPv4 with the runner. Only tested with IPv4 | `bool` | `false` | no | +| [aws\_partition](#input\_aws\_partition) | (optiona) partition in the arn namespace to use if not 'aws' | `string` | `"aws"` | no | +| [aws\_region](#input\_aws\_region) | AWS region. | `string` | n/a | yes | +| [cloudwatch\_config](#input\_cloudwatch\_config) | (optional) Replaces the module default cloudwatch log config. See https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/CloudWatch-Agent-Configuration-File-Details.html for details. | `string` | `null` | no | +| [enable\_ami\_housekeeper](#input\_enable\_ami\_housekeeper) | Option to disable the lambda to clean up old AMIs. 
| `bool` | `false` | no | +| [enable\_managed\_runner\_security\_group](#input\_enable\_managed\_runner\_security\_group) | Enabling the default managed security group creation. Unmanaged security groups can be specified via `runner_additional_security_group_ids`. | `bool` | `true` | no | +| [enable\_workflow\_job\_events\_queue](#input\_enable\_workflow\_job\_events\_queue) | Enabling this experimental feature will create a secondory sqs queue to wich a copy of the workflow\_job event will be delivered. | `bool` | `false` | no | +| [ghes\_ssl\_verify](#input\_ghes\_ssl\_verify) | GitHub Enterprise SSL verification. Set to 'false' when custom certificate (chains) is used for GitHub Enterprise Server (insecure). | `bool` | `true` | no | +| [ghes\_url](#input\_ghes\_url) | GitHub Enterprise Server URL. Example: https://github.internal.co - DO NOT SET IF USING PUBLIC GITHUB | `string` | `null` | no | +| [github\_app](#input\_github\_app) | GitHub app parameters, see your github app. Ensure the key is the base64-encoded `.pem` file (the output of `base64 app.private-key.pem`, not the content of `private-key.pem`). |
maxItems = optional(number)
minimumDaysOld = optional(number)
amiFilters = optional(list(object({
Name = string
Values = list(string)
})))
launchTemplateNames = optional(list(string))
ssmParameterNames = optional(list(string))
dryRun = optional(bool)
})
object({| n/a | yes | +| [instance\_profile\_path](#input\_instance\_profile\_path) | The path that will be added to the instance\_profile, if not set the environment name will be used. | `string` | `null` | no | +| [instance\_termination\_watcher](#input\_instance\_termination\_watcher) | Configuration for the spot termination watcher lambda function. This feature is Beta, changes will not trigger a major release as long in beta.
key_base64 = string
id = string
webhook_secret = string
})
object({| `{}` | no | +| [key\_name](#input\_key\_name) | Key pair name | `string` | `null` | no | +| [kms\_key\_arn](#input\_kms\_key\_arn) | Optional CMK Key ARN to be used for Parameter Store. | `string` | `null` | no | +| [lambda\_architecture](#input\_lambda\_architecture) | AWS Lambda architecture. Lambda functions using Graviton processors ('arm64') tend to have better price/performance than 'x86\_64' functions. | `string` | `"arm64"` | no | +| [lambda\_principals](#input\_lambda\_principals) | (Optional) add extra principals to the role created for execution of the lambda, e.g. for local testing. |
enable = optional(bool, false)
enable_metric = optional(object({
spot_warning = optional(bool, false)
}))
memory_size = optional(number, null)
s3_key = optional(string, null)
s3_object_version = optional(string, null)
timeout = optional(number, null)
zip = optional(string, null)
})
list(object({| `[]` | no | +| [lambda\_runtime](#input\_lambda\_runtime) | AWS Lambda runtime. | `string` | `"nodejs20.x"` | no | +| [lambda\_s3\_bucket](#input\_lambda\_s3\_bucket) | S3 bucket from which to specify lambda functions. This is an alternative to providing local files directly. | `string` | `null` | no | +| [lambda\_security\_group\_ids](#input\_lambda\_security\_group\_ids) | List of security group IDs associated with the Lambda function. | `list(string)` | `[]` | no | +| [lambda\_subnet\_ids](#input\_lambda\_subnet\_ids) | List of subnets in which the action runners will be launched, the subnets needs to be subnets in the `vpc_id`. | `list(string)` | `[]` | no | +| [log\_level](#input\_log\_level) | Logging level for lambda logging. Valid values are 'silly', 'trace', 'debug', 'info', 'warn', 'error', 'fatal'. | `string` | `"info"` | no | +| [logging\_kms\_key\_id](#input\_logging\_kms\_key\_id) | Specifies the kms key id to encrypt the logs with | `string` | `null` | no | +| [logging\_retention\_in\_days](#input\_logging\_retention\_in\_days) | Specifies the number of days you want to retain log events for the lambda log group. Possible values are: 0, 1, 3, 5, 7, 14, 30, 60, 90, 120, 150, 180, 365, 400, 545, 731, 1827, and 3653. | `number` | `180` | no | +| [metrics\_namespace](#input\_metrics\_namespace) | The namespace for the metrics created by the module. Merics will only be created if explicit enabled. | `string` | `"GitHub Runners"` | no | +| [multi\_runner\_config](#input\_multi\_runner\_config) | multi\_runner\_config = {
type = string
identifiers = list(string)
}))
map(object({| n/a | yes | +| [pool\_lambda\_reserved\_concurrent\_executions](#input\_pool\_lambda\_reserved\_concurrent\_executions) | Amount of reserved concurrent executions for the scale-up lambda function. A value of 0 disables lambda from being triggered and -1 removes any concurrency limitations. | `number` | `1` | no | +| [pool\_lambda\_timeout](#input\_pool\_lambda\_timeout) | Time out for the pool lambda in seconds. | `number` | `60` | no | +| [prefix](#input\_prefix) | The prefix used for naming resources | `string` | `"github-actions"` | no | +| [queue\_encryption](#input\_queue\_encryption) | Configure how data on queues managed by the modules is encrypted at rest. Options are encrypted via SSE, not encrypted, and encrypted via KMS. By default, encryption via SSE is enabled. For more details see the Terraform `aws_sqs_queue` resource https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/sqs_queue. |
runner_config = object({
runner_os = string
runner_architecture = string
runner_metadata_options = optional(map(any), {
instance_metadata_tags = "enabled"
http_endpoint = "enabled"
http_tokens = "required"
http_put_response_hop_limit = 1
})
ami_filter = optional(map(list(string)), { state = ["available"] })
ami_owners = optional(list(string), ["amazon"])
ami_id_ssm_parameter_name = optional(string, null)
ami_kms_key_arn = optional(string, "")
create_service_linked_role_spot = optional(bool, false)
credit_specification = optional(string, null)
delay_webhook_event = optional(number, 30)
disable_runner_autoupdate = optional(bool, false)
ebs_optimized = optional(bool, false)
enable_ephemeral_runners = optional(bool, false)
enable_job_queued_check = optional(bool, null)
enable_on_demand_failover_for_errors = optional(list(string), [])
enable_organization_runners = optional(bool, false)
enable_runner_binaries_syncer = optional(bool, true)
enable_ssm_on_runners = optional(bool, false)
enable_userdata = optional(bool, true)
instance_allocation_strategy = optional(string, "lowest-price")
instance_max_spot_price = optional(string, null)
instance_target_capacity_type = optional(string, "spot")
instance_types = list(string)
job_queue_retention_in_seconds = optional(number, 86400)
minimum_running_time_in_minutes = optional(number, null)
pool_runner_owner = optional(string, null)
runner_as_root = optional(bool, false)
runner_boot_time_in_minutes = optional(number, 5)
runner_extra_labels = optional(list(string), [])
runner_group_name = optional(string, "Default")
runner_name_prefix = optional(string, "")
runner_run_as = optional(string, "ec2-user")
runners_maximum_count = number
runner_additional_security_group_ids = optional(list(string), [])
scale_down_schedule_expression = optional(string, "cron(*/5 * * * ? *)")
scale_up_reserved_concurrent_executions = optional(number, 1)
userdata_template = optional(string, null)
userdata_content = optional(string, null)
enable_jit_config = optional(bool, null)
enable_runner_detailed_monitoring = optional(bool, false)
enable_cloudwatch_agent = optional(bool, true)
cloudwatch_config = optional(string, null)
userdata_pre_install = optional(string, "")
userdata_post_install = optional(string, "")
runner_ec2_tags = optional(map(string), {})
runner_iam_role_managed_policy_arns = optional(list(string), [])
vpc_id = optional(string, null)
subnet_ids = optional(list(string), null)
idle_config = optional(list(object({
cron = string
timeZone = string
idleCount = number
evictionStrategy = optional(string, "oldest_first")
})), [])
runner_log_files = optional(list(object({
log_group_name = string
prefix_log_group = bool
file_path = string
log_stream_name = string
})), null)
block_device_mappings = optional(list(object({
delete_on_termination = optional(bool, true)
device_name = optional(string, "/dev/xvda")
encrypted = optional(bool, true)
iops = optional(number)
kms_key_id = optional(string)
snapshot_id = optional(string)
throughput = optional(number)
volume_size = number
volume_type = optional(string, "gp3")
})), [{
volume_size = 30
}])
pool_config = optional(list(object({
schedule_expression = string
size = number
})), [])
})
matcherConfig = object({
labelMatchers = list(list(string))
exactMatch = optional(bool, false)
priority = optional(number, 999)
})
fifo = optional(bool, false)
redrive_build_queue = optional(object({
enabled = bool
maxReceiveCount = number
}), {
enabled = false
maxReceiveCount = null
})
}))
object({|
kms_data_key_reuse_period_seconds = number
kms_master_key_id = string
sqs_managed_sse_enabled = bool
})
{| no | +| [repository\_white\_list](#input\_repository\_white\_list) | List of github repository full names (owner/repo\_name) that will be allowed to use the github app. Leave empty for no filtering. | `list(string)` | `[]` | no | +| [role\_path](#input\_role\_path) | The path that will be added to the role; if not set, the environment name will be used. | `string` | `null` | no | +| [role\_permissions\_boundary](#input\_role\_permissions\_boundary) | Permissions boundary that will be added to the created role for the lambda. | `string` | `null` | no | +| [runner\_additional\_security\_group\_ids](#input\_runner\_additional\_security\_group\_ids) | (optional) List of additional security groups IDs to apply to the runner | `list(string)` | `[]` | no | +| [runner\_binaries\_s3\_sse\_configuration](#input\_runner\_binaries\_s3\_sse\_configuration) | Map containing server-side encryption configuration for runner-binaries S3 bucket. | `any` |
"kms_data_key_reuse_period_seconds": null,
"kms_master_key_id": null,
"sqs_managed_sse_enabled": true
}
{| no | +| [runner\_binaries\_s3\_versioning](#input\_runner\_binaries\_s3\_versioning) | Status of S3 versioning for runner-binaries S3 bucket. Once set to Enabled the change cannot be reverted via Terraform! | `string` | `"Disabled"` | no | +| [runner\_binaries\_syncer\_lambda\_timeout](#input\_runner\_binaries\_syncer\_lambda\_timeout) | Time out of the binaries sync lambda in seconds. | `number` | `300` | no | +| [runner\_binaries\_syncer\_lambda\_zip](#input\_runner\_binaries\_syncer\_lambda\_zip) | File location of the binaries sync lambda zip file. | `string` | `null` | no | +| [runner\_binaries\_syncer\_memory\_size](#input\_runner\_binaries\_syncer\_memory\_size) | Memory size limit in MB for binary syncer lambda. | `number` | `256` | no | +| [runner\_egress\_rules](#input\_runner\_egress\_rules) | List of egress rules for the GitHub runner instances. |
"rule": {
"apply_server_side_encryption_by_default": {
"sse_algorithm": "AES256"
}
}
}
list(object({|
cidr_blocks = list(string)
ipv6_cidr_blocks = list(string)
prefix_list_ids = list(string)
from_port = number
protocol = string
security_groups = list(string)
self = bool
to_port = number
description = string
}))
[| no | +| [runners\_lambda\_s3\_key](#input\_runners\_lambda\_s3\_key) | S3 key for runners lambda function. Required if using S3 bucket to specify lambdas. | `string` | `null` | no | +| [runners\_lambda\_s3\_object\_version](#input\_runners\_lambda\_s3\_object\_version) | S3 object version for runners lambda function. Useful if S3 versioning is enabled on source bucket. | `string` | `null` | no | +| [runners\_lambda\_zip](#input\_runners\_lambda\_zip) | File location of the lambda zip file for scaling runners. | `string` | `null` | no | +| [runners\_scale\_down\_lambda\_timeout](#input\_runners\_scale\_down\_lambda\_timeout) | Time out for the scale down lambda in seconds. | `number` | `60` | no | +| [runners\_scale\_up\_lambda\_timeout](#input\_runners\_scale\_up\_lambda\_timeout) | Time out for the scale up lambda in seconds. | `number` | `30` | no | +| [runners\_ssm\_housekeeper](#input\_runners\_ssm\_housekeeper) | Configuration for the SSM housekeeper lambda. This lambda deletes token / JIT config from SSM.
{
"cidr_blocks": [
"0.0.0.0/0"
],
"description": null,
"from_port": 0,
"ipv6_cidr_blocks": [
"::/0"
],
"prefix_list_ids": null,
"protocol": "-1",
"security_groups": null,
"self": null,
"to_port": 0
}
]
object({|
schedule_expression = optional(string, "rate(1 day)")
enabled = optional(bool, true)
lambda_memory_size = optional(number, 512)
lambda_timeout = optional(number, 60)
config = object({
tokenPath = optional(string)
minimumDaysOld = optional(number, 1)
dryRun = optional(bool, false)
})
})
{| no | +| [scale\_down\_lambda\_memory\_size](#input\_scale\_down\_lambda\_memory\_size) | Memory size limit in MB for scale down. | `number` | `512` | no | +| [scale\_up\_lambda\_memory\_size](#input\_scale\_up\_lambda\_memory\_size) | Memory size limit in MB for scale\_up lambda. | `number` | `512` | no | +| [ssm\_paths](#input\_ssm\_paths) | The root path used in SSM to store configuration and secreets. |
"config": {}
}
object({| `{}` | no | +| [state\_event\_rule\_binaries\_syncer](#input\_state\_event\_rule\_binaries\_syncer) | Option to disable EventBridge Lambda trigger for the binary syncer, useful to stop automatic updates of binary distribution | `string` | `"ENABLED"` | no | +| [subnet\_ids](#input\_subnet\_ids) | List of subnets in which the action runners will be launched, the subnets needs to be subnets in the `vpc_id`. | `list(string)` | n/a | yes | +| [syncer\_lambda\_s3\_key](#input\_syncer\_lambda\_s3\_key) | S3 key for syncer lambda function. Required if using S3 bucket to specify lambdas. | `string` | `null` | no | +| [syncer\_lambda\_s3\_object\_version](#input\_syncer\_lambda\_s3\_object\_version) | S3 object version for syncer lambda function. Useful if S3 versioning is enabled on source bucket. | `string` | `null` | no | +| [tags](#input\_tags) | Map of tags that will be added to created resources. By default resources will be tagged with name and environment. | `map(string)` | `{}` | no | +| [tracing\_config](#input\_tracing\_config) | Configuration for lambda tracing. |
root = optional(string, "github-action-runners")
app = optional(string, "app")
runners = optional(string, "runners")
webhook = optional(string, "webhook")
})
object({| `{}` | no | +| [vpc\_id](#input\_vpc\_id) | The VPC for security groups of the action runners. | `string` | n/a | yes | +| [webhook\_lambda\_apigateway\_access\_log\_settings](#input\_webhook\_lambda\_apigateway\_access\_log\_settings) | Access log settings for webhook API gateway. |
mode = optional(string, null)
capture_http_requests = optional(bool, false)
capture_error = optional(bool, false)
})
object({| `null` | no | +| [webhook\_lambda\_memory\_size](#input\_webhook\_lambda\_memory\_size) | Memory size limit in MB for webhook lambda. | `number` | `256` | no | +| [webhook\_lambda\_s3\_key](#input\_webhook\_lambda\_s3\_key) | S3 key for webhook lambda function. Required if using S3 bucket to specify lambdas. | `string` | `null` | no | +| [webhook\_lambda\_s3\_object\_version](#input\_webhook\_lambda\_s3\_object\_version) | S3 object version for webhook lambda function. Useful if S3 versioning is enabled on source bucket. | `string` | `null` | no | +| [webhook\_lambda\_timeout](#input\_webhook\_lambda\_timeout) | Time out of the lambda in seconds. | `number` | `10` | no | +| [webhook\_lambda\_zip](#input\_webhook\_lambda\_zip) | File location of the webhook lambda zip file. | `string` | `null` | no | +| [workflow\_job\_queue\_configuration](#input\_workflow\_job\_queue\_configuration) | Configuration options for workflow job queue which is only applicable if the flag enable\_workflow\_job\_events\_queue is set to true. |
destination_arn = string
format = string
})
object({|
delay_seconds = number
visibility_timeout_seconds = number
message_retention_seconds = number
})
{| no | + +## Outputs + +| Name | Description | +|------|-------------| +| [binaries\_syncer\_map](#output\_binaries\_syncer\_map) | n/a | +| [instance\_termination\_watcher](#output\_instance\_termination\_watcher) | n/a | +| [queues](#output\_queues) | SQS queues. | +| [runners\_map](#output\_runners\_map) | n/a | +| [ssm\_parameters](#output\_ssm\_parameters) | n/a | +| [webhook](#output\_webhook) | n/a | + diff --git a/github-runners/terraform/autoscaling/modules/multi-runner/ami-housekeeper.tf b/github-runners/terraform/autoscaling/modules/multi-runner/ami-housekeeper.tf new file mode 100644 index 00000000..dea5b0ab --- /dev/null +++ b/github-runners/terraform/autoscaling/modules/multi-runner/ami-housekeeper.tf @@ -0,0 +1,33 @@ + +module "ami_housekeeper" { + count = var.enable_ami_housekeeper ? 1 : 0 + source = "../ami-housekeeper" + + prefix = var.prefix + tags = local.tags + aws_partition = var.aws_partition + + lambda_zip = var.ami_housekeeper_lambda_zip + lambda_s3_bucket = var.lambda_s3_bucket + lambda_s3_key = var.ami_housekeeper_lambda_s3_key + lambda_s3_object_version = var.ami_housekeeper_lambda_s3_object_version + + lambda_architecture = var.lambda_architecture + lambda_principals = var.lambda_principals + lambda_runtime = var.lambda_runtime + lambda_security_group_ids = var.lambda_security_group_ids + lambda_subnet_ids = var.lambda_subnet_ids + lambda_memory_size = var.ami_housekeeper_lambda_memory_size + lambda_timeout = var.ami_housekeeper_lambda_timeout + tracing_config = var.tracing_config + + logging_retention_in_days = var.logging_retention_in_days + logging_kms_key_id = var.logging_kms_key_id + log_level = var.log_level + + role_path = var.role_path + role_permissions_boundary = var.role_permissions_boundary + + cleanup_config = var.ami_housekeeper_cleanup_config + lambda_schedule_expression = var.ami_housekeeper_lambda_schedule_expression +} diff --git a/github-runners/terraform/autoscaling/modules/multi-runner/main.tf 
b/github-runners/terraform/autoscaling/modules/multi-runner/main.tf new file mode 100644 index 00000000..68589248 --- /dev/null +++ b/github-runners/terraform/autoscaling/modules/multi-runner/main.tf @@ -0,0 +1,25 @@ +locals { + tags = merge(var.tags, { + "ghr:environment" = var.prefix + }) + + github_app_parameters = { + id = module.ssm.parameters.github_app_id + key_base64 = module.ssm.parameters.github_app_key_base64 + } + + runner_extra_labels = { for k, v in var.multi_runner_config : k => sort(setunion(flatten(v.matcherConfig.labelMatchers), compact(v.runner_config.runner_extra_labels))) } + + runner_config = { for k, v in var.multi_runner_config : k => merge({ id = aws_sqs_queue.queued_builds[k].id, arn = aws_sqs_queue.queued_builds[k].arn }, merge(v, { runner_config = merge(v.runner_config, { runner_extra_labels = local.runner_extra_labels[k] }) })) } + + tmp_distinct_list_unique_os_and_arch = distinct([for i, config in local.runner_config : { "os_type" : config.runner_config.runner_os, "architecture" : config.runner_config.runner_architecture } if config.runner_config.enable_runner_binaries_syncer]) + unique_os_and_arch = { for i, v in local.tmp_distinct_list_unique_os_and_arch : "${v.os_type}_${v.architecture}" => v } + + ssm_root_path = "/${var.ssm_paths.root}/${var.prefix}" +} + +resource "random_string" "random" { + length = 24 + special = false + upper = false +} diff --git a/github-runners/terraform/autoscaling/modules/multi-runner/outputs.tf b/github-runners/terraform/autoscaling/modules/multi-runner/outputs.tf new file mode 100644 index 00000000..47363ef3 --- /dev/null +++ b/github-runners/terraform/autoscaling/modules/multi-runner/outputs.tf @@ -0,0 +1,61 @@ + +output "runners_map" { + value = { for runner_key, runner in module.runners : runner_key => { + launch_template_name = runner.launch_template.name + launch_template_id = runner.launch_template.id + launch_template_version = runner.launch_template.latest_version + launch_template_ami_id = 
runner.launch_template.image_id + lambda_up = runner.lambda_scale_up + lambda_up_log_group = runner.lambda_scale_up_log_group + lambda_down = runner.lambda_scale_down + lambda_down_log_group = runner.lambda_scale_down_log_group + lambda_pool = runner.lambda_pool + lambda_pool_log_group = runner.lambda_pool_log_group + role_runner = runner.role_runner + role_scale_up = runner.role_scale_up + role_scale_down = runner.role_scale_down + role_pool = runner.role_pool + runners_log_groups = runner.runners_log_groups + logfiles = runner.logfiles + } + } +} + +output "binaries_syncer_map" { + value = { for runner_binary_key, runner_binary in module.runner_binaries : runner_binary_key => { + lambda = runner_binary.lambda + lambda_log_group = runner_binary.lambda_log_group + lambda_role = runner_binary.lambda_role + location = "s3://${runner_binary.bucket.id}/${runner_binary.runner_distribution_object_key}" + bucket = runner_binary.bucket + } } +} + +output "webhook" { + value = { + gateway = module.webhook.gateway + lambda = module.webhook.lambda + lambda_log_group = module.webhook.lambda_log_group + lambda_role = module.webhook.role + endpoint = "${module.webhook.gateway.api_endpoint}/${module.webhook.endpoint_relative_path}" + } +} + +output "ssm_parameters" { + value = module.ssm.parameters +} + +output "queues" { + description = "SQS queues." + value = { + webhook_workflow_job_queue = try(aws_sqs_queue.webhook_events_workflow_job_queue[*].arn, "") + } +} + +output "instance_termination_watcher" { + value = var.instance_termination_watcher.enable ? 
{ + lambda = module.instance_termination_watcher[0].lambda.function + lambda_log_group = module.instance_termination_watcher[0].lambda.log_group + lambda_role = module.instance_termination_watcher[0].lambda.role + } : null +} diff --git a/github-runners/terraform/autoscaling/modules/multi-runner/queues.tf b/github-runners/terraform/autoscaling/modules/multi-runner/queues.tf new file mode 100644 index 00000000..a7a8100e --- /dev/null +++ b/github-runners/terraform/autoscaling/modules/multi-runner/queues.tf @@ -0,0 +1,91 @@ + +data "aws_iam_policy_document" "deny_unsecure_transport" { + statement { + sid = "DenyUnsecureTransport" + + effect = "Deny" + + principals { + type = "AWS" + identifiers = ["*"] + } + + actions = [ + "sqs:*" + ] + + resources = [ + "*" + ] + + condition { + test = "Bool" + variable = "aws:SecureTransport" + values = ["false"] + } + } +} + + +resource "aws_sqs_queue" "queued_builds" { + for_each = var.multi_runner_config + name = "${var.prefix}-${each.key}-queued-builds${each.value.fifo ? ".fifo" : ""}" + delay_seconds = each.value.runner_config.delay_webhook_event + visibility_timeout_seconds = var.runners_scale_up_lambda_timeout + message_retention_seconds = each.value.runner_config.job_queue_retention_in_seconds + fifo_queue = each.value.fifo + receive_wait_time_seconds = 0 + content_based_deduplication = each.value.fifo + redrive_policy = each.value.redrive_build_queue.enabled ? 
jsonencode({ + deadLetterTargetArn = aws_sqs_queue.queued_builds_dlq[each.key].arn, + maxReceiveCount = each.value.redrive_build_queue.maxReceiveCount + }) : null + + sqs_managed_sse_enabled = var.queue_encryption.sqs_managed_sse_enabled + kms_master_key_id = var.queue_encryption.kms_master_key_id + kms_data_key_reuse_period_seconds = var.queue_encryption.kms_data_key_reuse_period_seconds + + tags = var.tags +} + +resource "aws_sqs_queue_policy" "build_queue_policy" { + for_each = var.multi_runner_config + queue_url = aws_sqs_queue.queued_builds[each.key].id + policy = data.aws_iam_policy_document.deny_unsecure_transport.json +} + +resource "aws_sqs_queue" "queued_builds_dlq" { + for_each = { for config, values in var.multi_runner_config : config => values if values.redrive_build_queue.enabled } + name = "${var.prefix}-${each.key}-queued-builds_dead_letter${each.value.fifo ? ".fifo" : ""}" + + sqs_managed_sse_enabled = var.queue_encryption.sqs_managed_sse_enabled + kms_master_key_id = var.queue_encryption.kms_master_key_id + kms_data_key_reuse_period_seconds = var.queue_encryption.kms_data_key_reuse_period_seconds + fifo_queue = each.value.fifo + tags = var.tags +} + +resource "aws_sqs_queue_policy" "build_queue_dlq_policy" { + for_each = { for config, values in var.multi_runner_config : config => values if values.redrive_build_queue.enabled } + queue_url = aws_sqs_queue.queued_builds_dlq[each.key].id + policy = data.aws_iam_policy_document.deny_unsecure_transport.json +} + +resource "aws_sqs_queue" "webhook_events_workflow_job_queue" { + count = var.enable_workflow_job_events_queue ? 
1 : 0 + name = "${var.prefix}-webhook_events_workflow_job_queue" + delay_seconds = var.workflow_job_queue_configuration.delay_seconds + visibility_timeout_seconds = var.workflow_job_queue_configuration.visibility_timeout_seconds + message_retention_seconds = var.workflow_job_queue_configuration.message_retention_seconds + fifo_queue = false + receive_wait_time_seconds = 0 + content_based_deduplication = false + redrive_policy = null + + sqs_managed_sse_enabled = var.queue_encryption.sqs_managed_sse_enabled + kms_master_key_id = var.queue_encryption.kms_master_key_id + kms_data_key_reuse_period_seconds = var.queue_encryption.kms_data_key_reuse_period_seconds + + tags = var.tags +} + diff --git a/github-runners/terraform/autoscaling/modules/multi-runner/runner-binaries.tf b/github-runners/terraform/autoscaling/modules/multi-runner/runner-binaries.tf new file mode 100644 index 00000000..64ff73aa --- /dev/null +++ b/github-runners/terraform/autoscaling/modules/multi-runner/runner-binaries.tf @@ -0,0 +1,44 @@ +module "runner_binaries" { + source = "../runner-binaries-syncer" + for_each = local.unique_os_and_arch + prefix = "${var.prefix}-${each.value.os_type}-${each.value.architecture}" + tags = local.tags + + # force mandatory lower case for s3 bucketname + distribution_bucket_name = lower("${var.prefix}-${each.value.os_type}-${each.value.architecture}-dist-${random_string.random.result}") + + runner_os = each.value.os_type + runner_architecture = each.value.architecture + + lambda_s3_bucket = var.lambda_s3_bucket + syncer_lambda_s3_key = var.syncer_lambda_s3_key + syncer_lambda_s3_object_version = var.syncer_lambda_s3_object_version + lambda_runtime = var.lambda_runtime + lambda_architecture = var.lambda_architecture + lambda_zip = var.runner_binaries_syncer_lambda_zip + lambda_memory_size = var.runner_binaries_syncer_memory_size + lambda_timeout = var.runner_binaries_syncer_lambda_timeout + tracing_config = var.tracing_config + logging_retention_in_days = 
var.logging_retention_in_days + logging_kms_key_id = var.logging_kms_key_id + state_event_rule_binaries_syncer = var.state_event_rule_binaries_syncer + + server_side_encryption_configuration = var.runner_binaries_s3_sse_configuration + s3_versioning = var.runner_binaries_s3_versioning + + role_path = var.role_path + role_permissions_boundary = var.role_permissions_boundary + + log_level = var.log_level + + lambda_subnet_ids = var.lambda_subnet_ids + lambda_security_group_ids = var.lambda_security_group_ids + aws_partition = var.aws_partition + + lambda_principals = var.lambda_principals +} +locals { + runner_binaries_by_os_and_arch_map = { + for k, v in module.runner_binaries : k => { arn = v.bucket.arn, id = v.bucket.id, key = v.runner_distribution_object_key } + } +} diff --git a/github-runners/terraform/autoscaling/modules/multi-runner/runners.tf b/github-runners/terraform/autoscaling/modules/multi-runner/runners.tf new file mode 100644 index 00000000..859e477e --- /dev/null +++ b/github-runners/terraform/autoscaling/modules/multi-runner/runners.tf @@ -0,0 +1,113 @@ +module "runners" { + source = "../runners" + for_each = local.runner_config + aws_region = var.aws_region + aws_partition = var.aws_partition + vpc_id = coalesce(each.value.runner_config.vpc_id, var.vpc_id) + subnet_ids = coalesce(each.value.runner_config.subnet_ids, var.subnet_ids) + prefix = "${var.prefix}-${each.key}" + tags = merge(local.tags, { + "ghr:environment" = "${var.prefix}-${each.key}" + }) + + s3_runner_binaries = each.value.runner_config.enable_runner_binaries_syncer ? 
local.runner_binaries_by_os_and_arch_map["${each.value.runner_config.runner_os}_${each.value.runner_config.runner_architecture}"] : null + + ssm_paths = { + root = "${local.ssm_root_path}/${each.key}" + tokens = "${var.ssm_paths.runners}/tokens" + config = "${var.ssm_paths.runners}/config" + } + + runner_os = each.value.runner_config.runner_os + instance_types = each.value.runner_config.instance_types + instance_target_capacity_type = each.value.runner_config.instance_target_capacity_type + instance_allocation_strategy = each.value.runner_config.instance_allocation_strategy + instance_max_spot_price = each.value.runner_config.instance_max_spot_price + block_device_mappings = each.value.runner_config.block_device_mappings + + runner_architecture = each.value.runner_config.runner_architecture + ami_filter = each.value.runner_config.ami_filter + ami_owners = each.value.runner_config.ami_owners + ami_id_ssm_parameter_name = each.value.runner_config.ami_id_ssm_parameter_name + ami_kms_key_arn = each.value.runner_config.ami_kms_key_arn + + sqs_build_queue = { "arn" : each.value.arn } + github_app_parameters = local.github_app_parameters + ebs_optimized = each.value.runner_config.ebs_optimized + enable_on_demand_failover_for_errors = each.value.runner_config.enable_on_demand_failover_for_errors + enable_organization_runners = each.value.runner_config.enable_organization_runners + enable_ephemeral_runners = each.value.runner_config.enable_ephemeral_runners + enable_jit_config = each.value.runner_config.enable_jit_config + enable_job_queued_check = each.value.runner_config.enable_job_queued_check + disable_runner_autoupdate = each.value.runner_config.disable_runner_autoupdate + enable_managed_runner_security_group = var.enable_managed_runner_security_group + enable_runner_detailed_monitoring = each.value.runner_config.enable_runner_detailed_monitoring + scale_down_schedule_expression = each.value.runner_config.scale_down_schedule_expression + minimum_running_time_in_minutes 
= each.value.runner_config.minimum_running_time_in_minutes + runner_boot_time_in_minutes = each.value.runner_config.runner_boot_time_in_minutes + runner_labels = sort(distinct(concat(["self-hosted", each.value.runner_config.runner_os, each.value.runner_config.runner_architecture], each.value.runner_config.runner_extra_labels))) + runner_as_root = each.value.runner_config.runner_as_root + runner_run_as = each.value.runner_config.runner_run_as + runners_maximum_count = each.value.runner_config.runners_maximum_count + idle_config = each.value.runner_config.idle_config + enable_ssm_on_runners = each.value.runner_config.enable_ssm_on_runners + egress_rules = var.runner_egress_rules + runner_additional_security_group_ids = try(coalescelist(each.value.runner_config.runner_additional_security_group_ids, var.runner_additional_security_group_ids), []) + metadata_options = each.value.runner_config.runner_metadata_options + credit_specification = each.value.runner_config.credit_specification + + enable_runner_binaries_syncer = each.value.runner_config.enable_runner_binaries_syncer + lambda_s3_bucket = var.lambda_s3_bucket + runners_lambda_s3_key = var.runners_lambda_s3_key + runners_lambda_s3_object_version = var.runners_lambda_s3_object_version + lambda_runtime = var.lambda_runtime + lambda_architecture = var.lambda_architecture + lambda_zip = var.runners_lambda_zip + lambda_scale_up_memory_size = var.scale_up_lambda_memory_size + lambda_timeout_scale_up = var.runners_scale_up_lambda_timeout + lambda_scale_down_memory_size = var.scale_down_lambda_memory_size + lambda_timeout_scale_down = var.runners_scale_down_lambda_timeout + lambda_subnet_ids = var.lambda_subnet_ids + lambda_security_group_ids = var.lambda_security_group_ids + tracing_config = var.tracing_config + logging_retention_in_days = var.logging_retention_in_days + logging_kms_key_id = var.logging_kms_key_id + enable_cloudwatch_agent = each.value.runner_config.enable_cloudwatch_agent + cloudwatch_config = 
try(coalesce(each.value.runner_config.cloudwatch_config, var.cloudwatch_config), null) + runner_log_files = each.value.runner_config.runner_log_files + runner_group_name = each.value.runner_config.runner_group_name + runner_name_prefix = each.value.runner_config.runner_name_prefix + + scale_up_reserved_concurrent_executions = each.value.runner_config.scale_up_reserved_concurrent_executions + + instance_profile_path = var.instance_profile_path + role_path = var.role_path + role_permissions_boundary = var.role_permissions_boundary + + enable_userdata = each.value.runner_config.enable_userdata + userdata_template = each.value.runner_config.userdata_template + userdata_content = each.value.runner_config.userdata_content + userdata_pre_install = each.value.runner_config.userdata_pre_install + userdata_post_install = each.value.runner_config.userdata_post_install + key_name = var.key_name + runner_ec2_tags = each.value.runner_config.runner_ec2_tags + + create_service_linked_role_spot = each.value.runner_config.create_service_linked_role_spot + + runner_iam_role_managed_policy_arns = each.value.runner_config.runner_iam_role_managed_policy_arns + + ghes_url = var.ghes_url + ghes_ssl_verify = var.ghes_ssl_verify + + kms_key_arn = var.kms_key_arn + + log_level = var.log_level + + pool_config = each.value.runner_config.pool_config + pool_lambda_timeout = var.pool_lambda_timeout + pool_runner_owner = each.value.runner_config.pool_runner_owner + pool_lambda_reserved_concurrent_executions = var.pool_lambda_reserved_concurrent_executions + associate_public_ipv4_address = var.associate_public_ipv4_address + + ssm_housekeeper = var.runners_ssm_housekeeper +} diff --git a/github-runners/terraform/autoscaling/modules/multi-runner/ssm.tf b/github-runners/terraform/autoscaling/modules/multi-runner/ssm.tf new file mode 100644 index 00000000..6b2591f4 --- /dev/null +++ b/github-runners/terraform/autoscaling/modules/multi-runner/ssm.tf @@ -0,0 +1,8 @@ +module "ssm" { + source = "../ssm" + 
+ kms_key_arn = var.kms_key_arn + path_prefix = "${local.ssm_root_path}/${var.ssm_paths.app}" + github_app = var.github_app + tags = local.tags +} diff --git a/github-runners/terraform/autoscaling/modules/multi-runner/termination-watcher.tf b/github-runners/terraform/autoscaling/modules/multi-runner/termination-watcher.tf new file mode 100644 index 00000000..c61063b1 --- /dev/null +++ b/github-runners/terraform/autoscaling/modules/multi-runner/termination-watcher.tf @@ -0,0 +1,27 @@ +locals { + lambda_instance_termination_watcher = { + prefix = var.prefix + tags = local.tags + aws_partition = var.aws_partition + architecture = var.lambda_architecture + principals = var.lambda_principals + runtime = var.lambda_runtime + security_group_ids = var.lambda_security_group_ids + subnet_ids = var.lambda_subnet_ids + log_level = var.log_level + logging_kms_key_id = var.logging_kms_key_id + logging_retention_in_days = var.logging_retention_in_days + role_path = var.role_path + role_permissions_boundary = var.role_permissions_boundary + metrics_namespace = var.metrics_namespace + s3_bucket = var.lambda_s3_bucket + tracing_config = var.tracing_config + } +} + +module "instance_termination_watcher" { + source = "../termination-watcher" + count = var.instance_termination_watcher.enable ? 1 : 0 + + config = merge(local.lambda_instance_termination_watcher, var.instance_termination_watcher) +} diff --git a/github-runners/terraform/autoscaling/modules/multi-runner/variables.ami-housekeepr.tf b/github-runners/terraform/autoscaling/modules/multi-runner/variables.ami-housekeepr.tf new file mode 100644 index 00000000..f92211b6 --- /dev/null +++ b/github-runners/terraform/autoscaling/modules/multi-runner/variables.ami-housekeepr.tf @@ -0,0 +1,58 @@ + +variable "enable_ami_housekeeper" { + description = "Option to disable the lambda to clean up old AMIs." + type = bool + default = false +} + +variable "ami_housekeeper_lambda_zip" { + description = "File location of the lambda zip file." 
+ type = string + default = null +} + +variable "ami_housekeeper_lambda_memory_size" { + description = "Memory size limit in MB of the lambda." + type = number + default = 256 +} + +variable "ami_housekeeper_lambda_timeout" { + description = "Time out of the lambda in seconds." + type = number + default = 300 +} + +variable "ami_housekeeper_lambda_s3_key" { + description = "S3 key for AMI housekeeper lambda function. Required if using S3 bucket to specify lambdas." + type = string + default = null +} + +variable "ami_housekeeper_lambda_s3_object_version" { + description = "S3 object version for AMI housekeeper lambda function. Useful if S3 versioning is enabled on source bucket." + type = string + default = null +} + +variable "ami_housekeeper_lambda_schedule_expression" { + description = "Scheduler expression for the AMI housekeeper lambda." + type = string + default = "cron(11 7 * * ? *)" # once a day +} + +variable "ami_housekeeper_cleanup_config" { + description = "Configuration for AMI cleanup." + type = object({ + maxItems = optional(number) + minimumDaysOld = optional(number) + amiFilters = optional(list(object({ + Name = string + Values = list(string) + }))) + launchTemplateNames = optional(list(string)) + ssmParameterNames = optional(list(string)) + dryRun = optional(bool) + }) + default = {} +} diff --git a/github-runners/terraform/autoscaling/modules/multi-runner/variables.tf b/github-runners/terraform/autoscaling/modules/multi-runner/variables.tf new file mode 100644 index 00000000..33556a80 --- /dev/null +++ b/github-runners/terraform/autoscaling/modules/multi-runner/variables.tf @@ -0,0 +1,646 @@ +variable "github_app" { + description = "GitHub app parameters, see your github app. Ensure the key is the base64-encoded `.pem` file (the output of `base64 app.private-key.pem`, not the content of `private-key.pem`)." 
+ type = object({ + key_base64 = string + id = string + webhook_secret = string + }) +} + +variable "prefix" { + description = "The prefix used for naming resources" + type = string + default = "github-actions" +} + +variable "kms_key_arn" { + description = "Optional CMK Key ARN to be used for Parameter Store." + type = string + default = null +} + +variable "tags" { + description = "Map of tags that will be added to created resources. By default resources will be tagged with name and environment." + type = map(string) + default = {} +} + +variable "multi_runner_config" { + type = map(object({ + runner_config = object({ + runner_os = string + runner_architecture = string + runner_metadata_options = optional(map(any), { + instance_metadata_tags = "enabled" + http_endpoint = "enabled" + http_tokens = "required" + http_put_response_hop_limit = 1 + }) + ami_filter = optional(map(list(string)), { state = ["available"] }) + ami_owners = optional(list(string), ["amazon"]) + ami_id_ssm_parameter_name = optional(string, null) + ami_kms_key_arn = optional(string, "") + create_service_linked_role_spot = optional(bool, false) + credit_specification = optional(string, null) + delay_webhook_event = optional(number, 30) + disable_runner_autoupdate = optional(bool, false) + ebs_optimized = optional(bool, false) + enable_ephemeral_runners = optional(bool, false) + enable_job_queued_check = optional(bool, null) + enable_on_demand_failover_for_errors = optional(list(string), []) + enable_organization_runners = optional(bool, false) + enable_runner_binaries_syncer = optional(bool, true) + enable_ssm_on_runners = optional(bool, false) + enable_userdata = optional(bool, true) + instance_allocation_strategy = optional(string, "lowest-price") + instance_max_spot_price = optional(string, null) + instance_target_capacity_type = optional(string, "spot") + instance_types = list(string) + job_queue_retention_in_seconds = optional(number, 86400) + minimum_running_time_in_minutes = 
optional(number, null) + pool_runner_owner = optional(string, null) + runner_as_root = optional(bool, false) + runner_boot_time_in_minutes = optional(number, 5) + runner_extra_labels = optional(list(string), []) + runner_group_name = optional(string, "Default") + runner_name_prefix = optional(string, "") + runner_run_as = optional(string, "ec2-user") + runners_maximum_count = number + runner_additional_security_group_ids = optional(list(string), []) + scale_down_schedule_expression = optional(string, "cron(*/5 * * * ? *)") + scale_up_reserved_concurrent_executions = optional(number, 1) + userdata_template = optional(string, null) + userdata_content = optional(string, null) + enable_jit_config = optional(bool, null) + enable_runner_detailed_monitoring = optional(bool, false) + enable_cloudwatch_agent = optional(bool, true) + cloudwatch_config = optional(string, null) + userdata_pre_install = optional(string, "") + userdata_post_install = optional(string, "") + runner_ec2_tags = optional(map(string), {}) + runner_iam_role_managed_policy_arns = optional(list(string), []) + vpc_id = optional(string, null) + subnet_ids = optional(list(string), null) + idle_config = optional(list(object({ + cron = string + timeZone = string + idleCount = number + evictionStrategy = optional(string, "oldest_first") + })), []) + runner_log_files = optional(list(object({ + log_group_name = string + prefix_log_group = bool + file_path = string + log_stream_name = string + })), null) + block_device_mappings = optional(list(object({ + delete_on_termination = optional(bool, true) + device_name = optional(string, "/dev/xvda") + encrypted = optional(bool, true) + iops = optional(number) + kms_key_id = optional(string) + snapshot_id = optional(string) + throughput = optional(number) + volume_size = number + volume_type = optional(string, "gp3") + })), [{ + volume_size = 30 + }]) + pool_config = optional(list(object({ + schedule_expression = string + size = number + })), []) + }) + + matcherConfig 
= object({ + labelMatchers = list(list(string)) + exactMatch = optional(bool, false) + priority = optional(number, 999) + }) + fifo = optional(bool, false) + redrive_build_queue = optional(object({ + enabled = bool + maxReceiveCount = number + }), { + enabled = false + maxReceiveCount = null + }) + })) + description = <
"delay_seconds": null,
"message_retention_seconds": null,
"visibility_timeout_seconds": null
}