From b7950bfd167b3a445ac43b42cf4d39e5e7ccc5c3 Mon Sep 17 00:00:00 2001
From: Paul Yang
Date: Fri, 4 Aug 2023 13:34:53 -0700
Subject: [PATCH] Add GitHub workflow to populate the persistent source schema.
---
 ...ine-populate-persistent-source-schema.yaml | 115 ++++++++++++++++++
 GLOSSARY.md                                   |  13 ++
 Makefile                                      |  23 ++++
 3 files changed, 151 insertions(+)
 create mode 100644 .github/workflows/cd-sql-engine-populate-persistent-source-schema.yaml
 create mode 100644 GLOSSARY.md

diff --git a/.github/workflows/cd-sql-engine-populate-persistent-source-schema.yaml b/.github/workflows/cd-sql-engine-populate-persistent-source-schema.yaml
new file mode 100644
index 0000000000..2a7075cf48
--- /dev/null
+++ b/.github/workflows/cd-sql-engine-populate-persistent-source-schema.yaml
@@ -0,0 +1,115 @@
+# See [Persistent Source Schema](/GLOSSARY.md#persistent-source-schema).
+# Populating the source schema via this workflow ensures that it's done with the same settings as the tests.
+
+name: Reload SQL Engine Test Data
+
+# We don't want multiple workflow runs trying to create the same tables.
+concurrency:
+  group: POPULATE_PERSISTENT_SOURCE_SCHEMA
+  cancel-in-progress: true
+
+on:
+  push:
+    branches:
+      - main
+  pull_request:
+    types: [labeled]
+
+env:
+  PYTHON_VERSION: "3.8"
+  ADDITIONAL_PYTEST_OPTIONS: "--use-persistent-source-schema"
+
+jobs:
+  snowflake-populate:
+    environment: DW_INTEGRATION_TESTS
+    if: ${{ github.event.action != 'labeled' || github.event.label.name == 'populate_persistent_source_schema' }}
+    name: Snowflake
+    runs-on: ubuntu-latest
+    steps:
+      - name: Check out the repo
+        uses: actions/checkout@v3
+
+      - name: Populate w/Python ${{ env.PYTHON_VERSION }}
+        uses: ./.github/actions/run-mf-tests
+        with:
+          python-version: ${{ env.PYTHON_VERSION }}
+          mf_sql_engine_url: ${{ secrets.MF_SNOWFLAKE_URL }}
+          mf_sql_engine_password: ${{ secrets.MF_SNOWFLAKE_PWD }}
+          parallelism: 1
+          additional-pytest-options: ${{ env.ADDITIONAL_PYTEST_OPTIONS }}
+          make-target: "populate-persistent-source-schema-snowflake"
+
+# redshift-populate:
+#   environment: DW_INTEGRATION_TESTS
+#   name: Redshift
+#   if: ${{ github.event.action != 'labeled' || github.event.label.name == 'populate_persistent_source_schema' }}
+#   runs-on: ubuntu-latest
+#   steps:
+#     - name: Check out the repo
+#       uses: actions/checkout@v3
+#
+#     - name: Populate w/Python ${{ env.PYTHON_VERSION }}
+#       uses: ./.github/actions/run-mf-tests
+#       with:
+#         python-version: ${{ env.PYTHON_VERSION }}
+#         mf_sql_engine_url: ${{ secrets.MF_REDSHIFT_URL }}
+#         mf_sql_engine_password: ${{ secrets.MF_REDSHIFT_PWD }}
+#         parallelism: 1
+#         additional-pytest-options: ${{ env.ADDITIONAL_PYTEST_OPTIONS }}
+#         make-target: "populate-persistent-source-schema-redshift"
+# bigquery-populate:
+#   environment: DW_INTEGRATION_TESTS
+#   name: BigQuery
+#   if: ${{ github.event.action != 'labeled' || github.event.label.name == 'populate_persistent_source_schema' }}
+#   runs-on: ubuntu-latest
+#   steps:
+#     - name: Check out the repo
+#       uses: actions/checkout@v3
+#
+#     - name: Populate w/Python ${{ env.PYTHON_VERSION }}
+#       uses: ./.github/actions/run-mf-tests
+#       with:
+#         python-version: ${{ env.PYTHON_VERSION }}
+#         mf_sql_engine_url: ${{ secrets.MF_BIGQUERY_URL }}
+#         mf_sql_engine_password: ${{ secrets.MF_BIGQUERY_PWD }}
+#         parallelism: 1
+#         additional-pytest-options: ${{ env.ADDITIONAL_PYTEST_OPTIONS }}
+#         make-target: "populate-persistent-source-schema-bigquery"
+#
+# databricks-cluster-populate:
+#   environment: DW_INTEGRATION_TESTS
+#   name: Databricks Cluster
+#   if: ${{ github.event.action != 'labeled' || github.event.label.name == 'populate_persistent_source_schema' }}
+#   runs-on: ubuntu-latest
+#   steps:
+#     - name: Check out the repo
+#       uses: actions/checkout@v3
+#
+#     - name: Populate w/Python ${{ env.PYTHON_VERSION }}
+#       uses: ./.github/actions/run-mf-tests
+#       with:
+#         python-version: ${{ env.PYTHON_VERSION }}
+#         mf_sql_engine_url: ${{ secrets.MF_DATABRICKS_CLUSTER_URL }}
+#         mf_sql_engine_password: ${{ secrets.MF_DATABRICKS_PWD }}
+#         parallelism: 1
+#         additional-pytest-options: ${{ env.ADDITIONAL_PYTEST_OPTIONS }}
+#         make-target: "populate-persistent-source-schema-databricks"
+#
+# databricks-sql-warehouse-populate:
+#   environment: DW_INTEGRATION_TESTS
+#   name: Databricks SQL Warehouse
+#   if: ${{ github.event.action != 'labeled' || github.event.label.name == 'populate_persistent_source_schema' }}
+#   runs-on: ubuntu-latest
+#   steps:
+#     - name: Check out the repo
+#       uses: actions/checkout@v3
+#
+#     - name: Populate w/Python ${{ env.PYTHON_VERSION }}
+#       uses: ./.github/actions/run-mf-tests
+#       with:
+#         python-version: ${{ env.PYTHON_VERSION }}
+#         mf_sql_engine_url: ${{ secrets.MF_DATABRICKS_SQL_WAREHOUSE_URL }}
+#         mf_sql_engine_password: ${{ secrets.MF_DATABRICKS_PWD }}
+#         parallelism: 1
+#         additional-pytest-options: ${{ env.ADDITIONAL_PYTEST_OPTIONS }}
+#         make-target: "populate-persistent-source-schema-databricks"
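The workflow runs on every push to `main`, and on pull requests only when the `populate_persistent_source_schema` label is applied (the `labeled` trigger combined with the `if:` guard on each job). One way to trigger it for an open PR, assuming the GitHub CLI is installed; the PR number is a placeholder:

```sh
# Apply the label that the workflow's `if:` condition checks for.
# "1234" is a placeholder PR number, not a reference to a real PR.
gh pr edit 1234 --add-label populate_persistent_source_schema
```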
diff --git a/GLOSSARY.md b/GLOSSARY.md
new file mode 100644
index 0000000000..7d7e5516da
--- /dev/null
+++ b/GLOSSARY.md
@@ -0,0 +1,13 @@
+# Glossary
+
+## Persistent source schema
+Many tests generate and execute SQL that depends on tables containing test data. By default, a
+pytest fixture creates a temporary schema and populates it with the tables that the tests
+require. This schema is referred to as the source schema. Creating the source schema (and
+the associated tables) can be a slow process for some SQL engines. Since these tables rarely
+change, functionality was added to run tests against a source schema that already exists and
+persists between runs (a persistent source schema). In addition, functionality was added to
+create the persistent source schema from the table definitions in the repo. Because the name
+of the source schema is derived from a hash of the data it is supposed to contain, creating
+and populating the persistent source schema should not be done concurrently, as there are
+race conditions when creating tables and inserting data.
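The glossary entry ties the schema name to a hash of its intended contents. A minimal sketch of that naming scheme, assuming a SHA-1 digest over the serialized table definitions and an `mf_test_src` prefix (both assumptions for illustration, not MetricFlow's actual implementation):

```sh
# Derive a schema name from the table definitions: any change to the test
# data produces a new name, so a stale schema is never silently reused.
# The prefix and digest length are illustrative assumptions.
table_definitions='CREATE TABLE fct_bookings (booking_id INT, booking_value FLOAT)'
digest=$(printf '%s' "$table_definitions" | sha1sum | cut -c1-10)
echo "mf_test_src_${digest}"   # prints mf_test_src_<10-hex-char-digest>
```

This is also why concurrent populate runs race: two runs over the same table definitions compute the same schema name and try to create the same tables, which is what the workflow's `concurrency` group prevents.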
diff --git a/Makefile b/Makefile
index bc2d10b863..2e1837e95d 100644
--- a/Makefile
+++ b/Makefile
@@ -7,6 +7,9 @@ PARALLELISM = "auto"
 # Additional command line options to pass to pytest.
 ADDITIONAL_PYTEST_OPTIONS = ""
 
+# Pytest node that populates the persistent source schema.
+POPULATE_PERSISTENT_SOURCE_SCHEMA = metricflow/test/source_schema_tools.py::populate_source_schema
+
 # Install Hatch package / project manager
 .PHONY: install-hatch
 install-hatch:
@@ -23,22 +26,42 @@ test-postgresql:
 
 # Engine-specific test environments. In most cases you should run these with
 # `make -e ADDITIONAL_PYTEST_OPTIONS="--use-persistent-source-schema" test-<engine>`
+#
+# See: [Persistent Source Schema](/GLOSSARY.md#persistent-source-schema)
 .PHONY: test-bigquery
 test-bigquery:
 	hatch -v run bigquery-env:pytest -vv -n $(PARALLELISM) $(ADDITIONAL_PYTEST_OPTIONS) metricflow/test/
 
+.PHONY: populate-persistent-source-schema-bigquery
+populate-persistent-source-schema-bigquery:
+	hatch -v run bigquery-env:pytest -vv $(ADDITIONAL_PYTEST_OPTIONS) $(POPULATE_PERSISTENT_SOURCE_SCHEMA)
+
 .PHONY: test-databricks
 test-databricks:
 	hatch -v run databricks-env:pytest -vv -n $(PARALLELISM) $(ADDITIONAL_PYTEST_OPTIONS) metricflow/test/
 
+.PHONY: populate-persistent-source-schema-databricks
+populate-persistent-source-schema-databricks:
+	hatch -v run databricks-env:pytest -vv $(ADDITIONAL_PYTEST_OPTIONS) $(POPULATE_PERSISTENT_SOURCE_SCHEMA)
+
 .PHONY: test-redshift
 test-redshift:
 	hatch -v run redshift-env:pytest -vv -n $(PARALLELISM) $(ADDITIONAL_PYTEST_OPTIONS) metricflow/test/
 
+.PHONY: populate-persistent-source-schema-redshift
+populate-persistent-source-schema-redshift:
+	hatch -v run redshift-env:pytest -vv $(ADDITIONAL_PYTEST_OPTIONS) $(POPULATE_PERSISTENT_SOURCE_SCHEMA)
+
 .PHONY: test-snowflake
 test-snowflake:
 	hatch -v run snowflake-env:pytest -vv -n $(PARALLELISM) $(ADDITIONAL_PYTEST_OPTIONS) metricflow/test/
 
+.PHONY: populate-persistent-source-schema-snowflake
+populate-persistent-source-schema-snowflake:
+	hatch -v run snowflake-env:pytest -vv $(ADDITIONAL_PYTEST_OPTIONS) $(POPULATE_PERSISTENT_SOURCE_SCHEMA)
+
 .PHONY: lint
 lint:
 	hatch -v run dev-env:pre-commit run --all-files
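To populate a persistent source schema from a local checkout rather than through the workflow, the new Makefile targets can be invoked directly. A sketch for Snowflake, assuming the hatch environment reads `MF_SQL_ENGINE_URL` and `MF_SQL_ENGINE_PASSWORD` from the environment (the variable names mirror the workflow's action inputs, but treating them as environment variables here is an assumption):

```sh
# Credentials for the target engine; values are placeholders.
export MF_SQL_ENGINE_URL='snowflake://<user>@<account>/?warehouse=<warehouse>'
export MF_SQL_ENGINE_PASSWORD='<password>'

# Same pytest flag the workflow sets via ADDITIONAL_PYTEST_OPTIONS.
make -e ADDITIONAL_PYTEST_OPTIONS="--use-persistent-source-schema" \
  populate-persistent-source-schema-snowflake
```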