diff --git a/IaC/main.tf b/IaC/main.tf index df350e7..1deb48c 100644 --- a/IaC/main.tf +++ b/IaC/main.tf @@ -1,7 +1,7 @@ terraform { backend "gcs" { } - required_version = "=0.12.29" + required_version = "~> 0.13.2" required_providers { google = "~> 3.13" } @@ -15,19 +15,23 @@ provider "google-beta" { project = var.project_id } +resource "random_id" "artifacts_bucket_name_suffix" { + byte_length = 5 +} module "network" { source = "./modules/network" - vpc_name = var.network_name + network_name = var.network_name } module "mlflow" { source = "./modules/mlflow" - artifacts_bucket_name = var.artifacts_bucket + artifacts_bucket_name = "${var.artifacts_bucket}-${random_id.artifacts_bucket_name_suffix.hex}" db_password_value = var.db_password_value - private_vpc_connection = module.network.private_vpc_connection - network_link = module.network.network_link server_docker_image = var.mlflow_docker_image project_id = var.project_id - vpc_connector = module.network.vpc_connector + consent_screen_support_email = var.consent_screen_support_email + web_app_users = var.web_app_users + network_self_link = module.network.network_self_link + network_short_name = module.network.network_short_name } diff --git a/IaC/modules/mlflow/database/main.tf b/IaC/modules/mlflow/database/main.tf index 79b4046..167e7e7 100644 --- a/IaC/modules/mlflow/database/main.tf +++ b/IaC/modules/mlflow/database/main.tf @@ -7,19 +7,16 @@ resource "google_sql_database_instance" "this_instance" { database_version = var.database_version region = var.region - depends_on = [var.private_vpc_connection] - settings { tier = var.size ip_configuration { ipv4_enabled = false - private_network = var.network_link + private_network = var.network_self_link } backup_configuration { enabled = true } availability_type = var.availability_type - } } diff --git a/IaC/modules/mlflow/database/variables.tf b/IaC/modules/mlflow/database/variables.tf index e77d728..6423cef 100644 --- a/IaC/modules/mlflow/database/variables.tf +++ 
b/IaC/modules/mlflow/database/variables.tf @@ -13,19 +13,11 @@ variable "region" { description = "Region of the database instance" default = "europe-west1" } -variable "private_vpc_connection" { - type = any - description = "Private connection used to connect your instance with" -} variable "size" { type = string description = "Size of the database instance" default = "db-f1-micro" } -variable "network_link" { - type = string - description = "Network link you want to connect your database with" -} variable "availability_type" { type = string description = "Availability of your instance" @@ -48,3 +40,7 @@ variable "module_depends_on" { type = any default = null } + +variable "network_self_link" { + type = string +} diff --git a/IaC/modules/mlflow/main.tf b/IaC/modules/mlflow/main.tf index 88344ab..da253f6 100644 --- a/IaC/modules/mlflow/main.tf +++ b/IaC/modules/mlflow/main.tf @@ -17,13 +17,12 @@ module "database" { instance_prefix = var.db_instance_prefix database_version = var.db_version region = var.db_region - private_vpc_connection = var.private_vpc_connection size = var.db_size - network_link = var.network_link availability_type = var.db_availability_type database_name = var.db_name username = var.db_username password = module.db_secret.secret_value + network_self_link = var.network_self_link } module "server" { @@ -32,13 +31,15 @@ module "server" { location = var.server_location docker_image_name = var.server_docker_image env_variables = var.server_env_variables - sql_instance_name = module.database.instance_connection_name db_private_ip = module.database.private_ip project_id = var.project_id db_password_name = var.db_password_name db_username = var.db_username db_name = var.db_name + db_instance = module.database.instance_connection_name gcs_backend = module.artifacts.url - vpc_connector = var.vpc_connector module_depends_on = var.module_depends_on + consent_screen_support_email = var.consent_screen_support_email + web_app_users = var.web_app_users + 
network_short_name = var.network_short_name } diff --git a/IaC/modules/mlflow/server/main.tf b/IaC/modules/mlflow/server/main.tf index b8b96c3..0928e97 100644 --- a/IaC/modules/mlflow/server/main.tf +++ b/IaC/modules/mlflow/server/main.tf @@ -10,85 +10,101 @@ locals { }, var.env_variables) } +data "google_project" "project" { +} -resource "google_service_account" "service_account_cloud_run" { - account_id = format("cloud-run-%s", var.server_name) - display_name = "Cloud run service account used" +resource "google_app_engine_application" "app" { + location_id = var.location + iap { + enabled = true + oauth2_client_id = google_iap_client.project_client.client_id + oauth2_client_secret = google_iap_client.project_client.secret + } } resource "google_project_iam_member" "cloudsql" { - project = google_service_account.service_account_cloud_run.project + project = data.google_project.project.project_id role = "roles/cloudsql.client" - member = format("serviceAccount:%s", google_service_account.service_account_cloud_run.email) + member = format("serviceAccount:%s@appspot.gserviceaccount.com", data.google_project.project.project_id) } resource "google_project_iam_member" "secret" { - project = google_service_account.service_account_cloud_run.project + project = data.google_project.project.project_id role = "roles/secretmanager.secretAccessor" - member = format("serviceAccount:%s", google_service_account.service_account_cloud_run.email) + member = format("serviceAccount:%s@appspot.gserviceaccount.com", data.google_project.project.project_id) } resource "google_project_iam_member" "gcs" { - project = google_service_account.service_account_cloud_run.project + project = data.google_project.project.project_id role = "roles/storage.objectAdmin" - member = format("serviceAccount:%s", google_service_account.service_account_cloud_run.email) + member = format("serviceAccount:service-%s@gae-api-prod.google.com.iam.gserviceaccount.com", data.google_project.project.number) } +resource
"google_project_iam_member" "gae_api" { + project = data.google_project.project.project_id + role = "roles/compute.networkUser" + member = format("serviceAccount:%s@appspot.gserviceaccount.com", data.google_project.project.project_id) +} -resource "google_cloud_run_service" "default" { - name = var.server_name - location = var.location +resource "google_app_engine_flexible_app_version" "myapp_v1" { + service = var.service + version_id = "v1" + runtime = "custom" - template { - spec { - service_account_name = google_service_account.service_account_cloud_run.email - containers { - image = var.docker_image_name - dynamic "env" { - for_each = local.env_variables - content { - name = env.key - value = env.value - } - } - resources { - limits = { - cpu = var.cpu_limit - memory = var.memory_limit - } - } - } - } - metadata { - annotations = { - "run.googleapis.com/cloudsql-instances" = var.sql_instance_name - "run.googleapis.com/vpc-access-connector" = var.vpc_connector - } + deployment { + container { + image = var.docker_image_name } } - traffic { - percent = 100 - latest_revision = true + liveness_check { + path = "/" + } + + readiness_check { + path = "/" } - autogenerate_revision_name = true - depends_on = [google_project_iam_member.cloudsql, google_project_iam_member.secret, google_project_iam_member.gcs, var.module_depends_on] -} + env_variables = local.env_variables -data "google_iam_policy" "noauth" { - binding { - role = "roles/run.invoker" - members = [ - "allUsers", - ] + automatic_scaling { + cool_down_period = "120s" + max_total_instances = 1 + min_total_instances = 1 + cpu_utilization { + target_utilization = 0.5 + } + } + resources { + cpu = 1 + memory_gb = 2 + } + network { + name = var.network_short_name } -} -resource "google_cloud_run_service_iam_policy" "noauth" { - location = google_cloud_run_service.default.location - project = google_cloud_run_service.default.project - service = google_cloud_run_service.default.name + beta_settings = { +
cloud_sql_instances = format("%s=tcp:3306", var.db_instance) + } - policy_data = data.google_iam_policy.noauth.policy_data + noop_on_destroy = true + depends_on = [google_project_iam_member.gcs, google_project_iam_member.cloudsql, google_project_iam_member.secret, google_project_iam_member.gae_api] +} + +resource "google_iap_brand" "project_brand" { + support_email = var.consent_screen_support_email + application_title = "mlflow" + project = data.google_project.project.number +} +resource "google_iap_client" "project_client" { + display_name = "mlflow" + brand = google_iap_brand.project_brand.name } +resource "google_iap_app_engine_service_iam_binding" "member" { + project = data.google_project.project.project_id + app_id = data.google_project.project.project_id + service = google_app_engine_flexible_app_version.myapp_v1.service + role = "roles/iap.httpsResourceAccessor" + members = var.web_app_users + depends_on = [google_app_engine_flexible_app_version.myapp_v1] +} \ No newline at end of file diff --git a/IaC/modules/mlflow/server/variables.tf b/IaC/modules/mlflow/server/variables.tf index 4a81b20..7fcf354 100644 --- a/IaC/modules/mlflow/server/variables.tf +++ b/IaC/modules/mlflow/server/variables.tf @@ -5,7 +5,7 @@ variable "server_name" { variable "location" { type = string description = "Location to deploy your server" - default = "europe-west1" + default = "europe-west" } variable "docker_image_name" { type = string @@ -15,10 +15,6 @@ variable "env_variables" { type = map description = "Env variable to be used in your container" } -variable "sql_instance_name" { - type = string - description = "Sql instance name your server needs access to" -} variable "project_id" { description = "GCP project" type = string @@ -35,34 +31,31 @@ variable "db_name" { description = "Name of the database" type = string } -variable "gcs_backend" { - description = "Gcs bucket used for artifacts" +variable "db_instance" { + description = "Name of the database instance" type = string }
-variable "cpu_limit" { - type = string - description = "Maximum cpu" - default = "1000m" -} -variable "memory_limit" { - type = string - description = "Memory limit of your container" - default = "1024Mi" -} -variable "vpc_connector" { +variable "gcs_backend" { + description = "Gcs bucket used for artifacts" type = string - description = "Vpc connector of your private network" } variable "db_private_ip" { type = string description = "Private ip of the db" } variable "module_depends_on" { - type = any - default = null + type = any + default = null } -variable "service_account_mlflow_users" { +variable "consent_screen_support_email" { type = string - default = "mlflow-users" - description = "Service account created to connect to mlflow" -} \ No newline at end of file + description = "Person or group to contact in case of problem" +} +variable "web_app_users" { + type = list(string) + description = "List of people who can acess the mlflow web app. e.g. [user:jane@example.com, group:people@example.com]" +} +variable "service" { + default = "default" +} +variable "network_short_name" {} diff --git a/IaC/modules/mlflow/variables.tf b/IaC/modules/mlflow/variables.tf index 122dda8..6f98dc9 100644 --- a/IaC/modules/mlflow/variables.tf +++ b/IaC/modules/mlflow/variables.tf @@ -46,19 +46,11 @@ variable "db_region" { type = string default = "europe-west1" } -variable "private_vpc_connection" { - description = "Vpc connection with the database" - type = any -} variable "db_size" { description = "Database instance size" type = string default = "db-f1-micro" } -variable "network_link" { - description = "Link to your network" - type = string -} variable "db_availability_type" { description = "Availability of your database" type = string @@ -77,7 +69,7 @@ variable "mlflow_server" { variable "server_location" { description = "Location to deploy cloud run server" type = string - default = "europe-west1" + default = "europe-west" } variable "server_docker_image" { description = 
"Docker image name of your mlflow server" @@ -92,11 +84,19 @@ variable "project_id" { description = "GCP project" type = string } -variable "vpc_connector" { - type = string - description = "Vpc connector of your private network" -} variable "module_depends_on" { type = any default = null } +variable "consent_screen_support_email" { + type = string + description = "Person or group to contact in case of problem" +} +variable "web_app_users" { + type = list(string) + description = "List of people who can acess the mlflow web app. e.g. [user:jane@example.com, group:people@example.com]" +} +variable "network_self_link" { + type = string +} +variable "network_short_name" {} \ No newline at end of file diff --git a/IaC/modules/network/main.tf b/IaC/modules/network/main.tf index 4ed1a9c..f109a28 100644 --- a/IaC/modules/network/main.tf +++ b/IaC/modules/network/main.tf @@ -1,28 +1,46 @@ -resource "google_compute_network" "vpc" { - name = var.vpc_name - routing_mode = "GLOBAL" +resource "google_compute_network" "private_network" { + count = length(var.network_name) > 0 ? 0 : 1 + name = var.network_name_local auto_create_subnetworks = true } -resource "google_compute_global_address" "private_ip_address" { - provider = google-beta +resource "google_compute_firewall" "allow-internal" { + count = length(var.network_name) > 0 ? 0 : 1 + name = "${var.network_name_local}-allow-internal" + network = google_compute_network.private_network[0].name - name = "private-ip-address" + allow { + protocol = "icmp" + } + allow { + protocol = "tcp" + ports = ["0-65535"] + } + allow { + protocol = "udp" + ports = ["0-65535"] + } + + # Default internal range where our priveta network subnetworks are deployed. + # c.f. 
https://www.terraform.io/docs/providers/google/r/compute_network.html#auto_create_subnetworks + source_ranges = ["10.128.0.0/9"] +} + +resource "google_compute_global_address" "private_ip_addresses" { + name = "private-ip-addresses" purpose = "VPC_PEERING" address_type = "INTERNAL" prefix_length = 16 - network = google_compute_network.vpc.self_link + network = length(var.network_name) > 0 ? var.network_name : google_compute_network.private_network[0].name } -resource "google_service_networking_connection" "private_vpc_connection" { - network = google_compute_network.vpc.self_link +resource "google_service_networking_connection" "peering_connection" { + network = length(var.network_name) > 0 ? var.network_name : google_compute_network.private_network[0].name service = "servicenetworking.googleapis.com" - reserved_peering_ranges = [google_compute_global_address.private_ip_address.name] + reserved_peering_ranges = [google_compute_global_address.private_ip_addresses.name] } -resource "google_vpc_access_connector" "vpc_con" { - name = "vpc-con" - region = var.region - ip_cidr_range = lookup(var.vpc_connector_regions, var.region) - network = google_compute_network.vpc.name -} +data "google_compute_network" "default_network" { + name = length(var.network_name) > 0 ? 
var.network_name : google_compute_network.private_network[0].name + depends_on = [google_service_networking_connection.peering_connection] +} \ No newline at end of file diff --git a/IaC/modules/network/output.tf b/IaC/modules/network/output.tf new file mode 100644 index 0000000..4d9a7c9 --- /dev/null +++ b/IaC/modules/network/output.tf @@ -0,0 +1,7 @@ +output "network_self_link" { + value = data.google_compute_network.default_network.self_link +} + +output "network_short_name" { + value = data.google_compute_network.default_network.name +} \ No newline at end of file diff --git a/IaC/modules/network/outputs.tf b/IaC/modules/network/outputs.tf deleted file mode 100644 index bf9dedc..0000000 --- a/IaC/modules/network/outputs.tf +++ /dev/null @@ -1,12 +0,0 @@ -output "network_link" { - description = "Link of the created network" - value = google_compute_network.vpc.self_link -} -output "private_vpc_connection" { - description = "Private vpc connection to servicenetworking" - value = google_service_networking_connection.private_vpc_connection -} -output "vpc_connector" { - description = "Connector to your private network" - value = google_vpc_access_connector.vpc_con.self_link -} diff --git a/IaC/modules/network/variables.tf b/IaC/modules/network/variables.tf index fce4fee..e84a075 100644 --- a/IaC/modules/network/variables.tf +++ b/IaC/modules/network/variables.tf @@ -1,16 +1,11 @@ -variable "vpc_name" { - type = string - description = "Name of the network you want to create" +variable "network_name" { + type = string + description = "Name of the network to attach to. 
If empty, a new network will be created" } -variable "region" { + +variable "network_name_local" { type = string - description = "Region to deploy your vpc connector" - default = "europe-west1" + description = "Name of the network to create if network_name does not exist already" + default = "mlflow-network" } -variable "vpc_connector_regions" { - type = map - description = "Regions where the VPC Access connector resides and the matching ip cidr range" - default = { - "europe-west1" = "10.8.0.0/28" - } -} \ No newline at end of file + diff --git a/IaC/modules/services/main.tf b/IaC/modules/services/main.tf index d817bbe..7891065 100644 --- a/IaC/modules/services/main.tf +++ b/IaC/modules/services/main.tf @@ -2,5 +2,5 @@ resource "google_project_service" "project" { count = length(var.services) project = var.project_id service = var.services[count.index] - disable_dependent_services = true + disable_dependent_services = false } diff --git a/IaC/prerequesites/main.tf b/IaC/prerequesites/main.tf index 60102b4..299d188 100644 --- a/IaC/prerequesites/main.tf +++ b/IaC/prerequesites/main.tf @@ -1,5 +1,5 @@ terraform { - required_version = "=0.12.29" + required_version = "~> 0.13.2" required_providers { google = "~> 3.13" } @@ -12,9 +12,16 @@ provider "google" { module "services" { source = "./../modules/services" project_id = var.project_id - services = ["container.googleapis.com", "servicenetworking.googleapis.com", - "stackdriver.googleapis.com", "vpcaccess.googleapis.com", "run.googleapis.com", - "sqladmin.googleapis.com", "secretmanager.googleapis.com"] + services = [ + "container.googleapis.com", + "servicenetworking.googleapis.com", + "stackdriver.googleapis.com", + "appengine.googleapis.com", + "appengineflex.googleapis.com", + "sqladmin.googleapis.com", + "secretmanager.googleapis.com", + "iap.googleapis.com" + ] } module "bucket_backend" { diff --git a/IaC/variables.tf b/IaC/variables.tf index 7acd7f1..bfce900 100644 --- a/IaC/variables.tf +++ b/IaC/variables.tf 
@@ -15,8 +15,15 @@ variable "mlflow_docker_image" { description = "Docker image used in container registry" type = string } -variable "network_name" { - description = "Network used" +variable "consent_screen_support_email" { type = string - default = "default-private" + description = "Person or group to contact in case of problem (address shown in the OAuth consent screen)" +} +variable "web_app_users" { + type = list(string) + description = "List of people who can acess the mlflow web app. e.g. [user:jane@example.com, group:people@example.com]" } +variable "network_name" { + type = string + description = "Name of the network to attach to. If empty, a new network will be created" +} \ No newline at end of file diff --git a/Makefile b/Makefile index 0b8db72..2fc5ccf 100644 --- a/Makefile +++ b/Makefile @@ -1,28 +1,23 @@ -DOCKER_REPO := eu.gcr.io -DOCKER_NAME := mlflow -DOCKER_TAG := 0.1 - - pre-requesites: - @cd Iac/prerequesites && terraform init && terraform apply -var="project_id=$(PROJECT_ID)" + . ./vars_base && cd IaC/prerequesites && terraform init && terraform apply build-docker: - @cd tracking_server && docker build -t $(DOCKER_REPO)/$(PROJECT_ID)/$(DOCKER_NAME):$(DOCKER_TAG) -f tracking.Dockerfile . + . ./vars_base && cd tracking_server && docker build -t $${TF_VAR_mlflow_docker_image} -f tracking.Dockerfile .
push-docker: - @docker push $(DOCKER_REPO)/$(PROJECT_ID)/$(DOCKER_NAME):$(DOCKER_TAG) + . ./vars_base && docker push $${TF_VAR_mlflow_docker_image} init-terraform: - @cd Iac && terraform init -backend-config="bucket=$(BACKEND_TERRAFORM)" + . ./vars_base && cd IaC && terraform init -backend-config="bucket=$${TF_VAR_backend_bucket}" apply-terraform: - @cd Iac && terraform apply -var="project_id=$(PROJECT_ID)" -var="mlflow_docker_image=$(DOCKER_REPO)/$(PROJECT_ID)/$(DOCKER_NAME):$(DOCKER_TAG)" + . ./vars_base && cd IaC && terraform apply plan-terraform: - @cd Iac && terraform plan -var="project_id=$(PROJECT_ID)" -var="mlflow_docker_image=$(DOCKER_REPO)/$(PROJECT_ID)/$(DOCKER_NAME):$(DOCKER_TAG)" + . ./vars_base && cd IaC && terraform plan destroy-terraform: - @cd Iac && terraform destroy -var="project_id=$(PROJECT_ID)" -var="mlflow_docker_image=$(DOCKER_REPO)/$(PROJECT_ID)/$(DOCKER_NAME):$(DOCKER_TAG)" + . ./vars_base && cd IaC && terraform destroy apply: init-terraform apply-terraform @@ -31,3 +26,9 @@ plan: init-terraform plan-terraform destroy: init-terraform destroy-terraform docker: build-docker push-docker + +init: pre-requesites + +deploy: docker apply + +one-click-mlflow : init deploy diff --git a/README.md b/README.md index 70623dc..4b0c071 100644 --- a/README.md +++ b/README.md @@ -4,25 +4,25 @@ A tool to deploy a mostly serverless MLflow on a GCP project with one command ## How to use ### Pre-requesites -- Create a GCP project -- Init gcloud -- Have docker installed locally -- Export the name of your GCP project by running `EXPORT PROJECT_ID=YOUR_PROJECT` -- Run `make pre-requesites`. You'll be asked to enter the bucket name used to store terraform state, and your project id -- Deploy mlflow container by running `make docker`.
This will build mlflow docker image locally and push it to container registry +- A GCP project +- Initialized gcloud SDK +- Docker engine running -### Deploy mlflow -- Export the name of your GCP project by running `EXPORT PROJECT_ID=YOUR_PROJECT` -- Export the name of your bucket used to store terraform state by running `EXPORT BACKEND_TERRAFORM=BUCKET_NAME`. It must be the same as the one you selected when installing the pre-requesites. -- Deploy mlflow to GCP by running `make apply`. You'll be asked to enter the database password you want to use +### Deploying +Fill out the `vars` file. +|Variable name|Description| +|---|---| +|`TF_VAR_project_id`|Name of the GCP project| +|`TF_VAR_backend_bucket`|Name of the terraform backend bucket. Should be unique. No `gs://` prefix| +|`TF_VAR_consent_screen_support_email`|Contact email address displayed by the SSO screen when the user trying to log in is not authorized. The address should be that of the user deploying mlflow (you) or a Cloud Identity group managed by this user| +|`TF_VAR_web_app_users`|List of authorized users/groups/domains. Should be a single quoted list of string such as '["user:jane@example.com", "group:people@example.com", "domain:example.com"]'| +|`TF_VAR_network_name`|The network the application and backend should attach to. If blank, a new network will be created.| -## Goals +Run `make one-click-mlflow` and follow the prompts. -The project's deliverables are -- MLflow tracking server on Cloud Run -- Artifacts on GCS -- Metrics backend on Cloud SQL (MySQL) -- Terraformed infrastructure -- A list of all the GCP APIs that need to be enabled -- A list of all the necessary GCP permissions to run the deployment +### Other available make commands +- `make deploy`: builds and pushes the application image and (re)deploys the infrastructure +- `make docker`: builds and pushes the application image +- `make apply`: (re)deploys the infrastructure +- `make destroy`: destroys the infrastructure.
**Will not delete the OAuth consent screen, and the app engine application** diff --git a/tracking_server/run_tracking.sh b/tracking_server/run_tracking.sh index 5d432f0..c2b33be 100644 --- a/tracking_server/run_tracking.sh +++ b/tracking_server/run_tracking.sh @@ -1,7 +1,7 @@ #!/bin/bash DB_PASSWORD=$(gcloud beta secrets versions access --project=${GCP_PROJECT} --secret=${DB_PASSWORD_NAME} latest) -BACKEND_URI=mysql+pymysql://${DB_USERNAME}:${DB_PASSWORD}@${DB_PRIVATE_IP}/${DB_NAME} +BACKEND_URI=mysql+pymysql://${DB_USERNAME}:${DB_PASSWORD}@${DB_PRIVATE_IP}:3306/${DB_NAME} mlflow db upgrade ${BACKEND_URI} @@ -9,4 +9,4 @@ mlflow server \ --backend-store-uri ${BACKEND_URI} \ --default-artifact-root ${GCS_BACKEND} \ --host 0.0.0.0 \ - --port $PORT + --port ${PORT} diff --git a/vars b/vars new file mode 100644 index 0000000..43044fd --- /dev/null +++ b/vars @@ -0,0 +1,5 @@ +export TF_VAR_project_id=two-click-mlflow +export TF_VAR_backend_bucket=tfstate-mlflow-atf-two +export TF_VAR_consent_screen_support_email=alexis.vialaret@artefact.com +export TF_VAR_web_app_users='["user:alexis.vialaret@artefact.com", "user:thomas.griseau@artefact.com"]' +export TF_VAR_network_name=default \ No newline at end of file diff --git a/vars_base b/vars_base new file mode 100644 index 0000000..5f8133f --- /dev/null +++ b/vars_base @@ -0,0 +1,5 @@ +. ./vars +export DOCKER_REPO=eu.gcr.io +export DOCKER_NAME=mlflow +export DOCKER_TAG=0.1 +export TF_VAR_mlflow_docker_image=$DOCKER_REPO/$TF_VAR_project_id/$DOCKER_NAME:$DOCKER_TAG \ No newline at end of file