From 609c08074962a783fc87d3a01c6192c93c515ba1 Mon Sep 17 00:00:00 2001
From: Paul Yang
Date: Fri, 4 Aug 2023 13:34:53 -0700
Subject: [PATCH] Add Github workflow to populate the persistent source schema.

---
 ...ine-populate-persistent-source-schema.yaml | 110 ++++++++++++++++++
 GLOSSARY.md                                   |  13 +++
 Makefile                                      |  25 +++-
 3 files changed, 146 insertions(+), 2 deletions(-)
 create mode 100644 .github/workflows/cd-sql-engine-populate-persistent-source-schema.yaml
 create mode 100644 GLOSSARY.md

diff --git a/.github/workflows/cd-sql-engine-populate-persistent-source-schema.yaml b/.github/workflows/cd-sql-engine-populate-persistent-source-schema.yaml
new file mode 100644
index 0000000000..e62b12b16d
--- /dev/null
+++ b/.github/workflows/cd-sql-engine-populate-persistent-source-schema.yaml
@@ -0,0 +1,110 @@
+# See [Persistent Source Schema](/GLOSSARY.md#persistent-source-schema)
+# Populating the source schema via this workflow ensures that it's done with the same settings as the tests.
+
+name: Reload Test Data in SQL Engines
+
+# Runs share one concurrency group so that multiple workflows never create the same table at once; a newer run cancels an in-progress one.
+concurrency:
+  group: POPULATE_PERSISTENT_SOURCE_SCHEMA
+  cancel-in-progress: true
+
+on:
+  pull_request:
+    types: [labeled]
+  workflow_dispatch:
+
+env:
+  # The label name 'Reload Test Data in SQL Engines' is repeated in each job because the `env` context is not available in job-level `if` conditions.
+  PYTHON_VERSION: "3.8"
+
+jobs:
+  snowflake-populate:
+    environment: DW_INTEGRATION_TESTS
+    name: Snowflake
+    if: >
+      github.event_name == 'workflow_dispatch'
+      || (github.event.action == 'labeled' && github.event.label.name == 'Reload Test Data in SQL Engines')
+    runs-on: ubuntu-latest
+    steps:
+      - name: Check-out the repo
+        uses: actions/checkout@v3
+
+      - name: Populate w/Python ${{ env.PYTHON_VERSION }}
+        uses: ./.github/actions/run-mf-tests
+        with:
+          python-version: ${{ env.PYTHON_VERSION }}
+          mf_sql_engine_url: ${{ secrets.MF_SNOWFLAKE_URL }}
+          mf_sql_engine_password: ${{ secrets.MF_SNOWFLAKE_PWD }}
+          parallelism: 1
+          make-target: "populate-persistent-source-schema-snowflake"
+
+  redshift-populate:
+    environment: DW_INTEGRATION_TESTS
+    name: Redshift
+    if: >
+      github.event_name == 'workflow_dispatch'
+      || (github.event.action == 'labeled' && github.event.label.name == 'Reload Test Data in SQL Engines')
+    runs-on: ubuntu-latest
+    steps:
+      - name: Check-out the repo
+        uses: actions/checkout@v3
+
+      - name: Populate w/Python ${{ env.PYTHON_VERSION }}
+        uses: ./.github/actions/run-mf-tests
+        with:
+          python-version: ${{ env.PYTHON_VERSION }}
+          mf_sql_engine_url: ${{ secrets.MF_REDSHIFT_URL }}
+          mf_sql_engine_password: ${{ secrets.MF_REDSHIFT_PWD }}
+          parallelism: 1
+          make-target: "populate-persistent-source-schema-redshift"
+
+  bigquery-populate:
+    environment: DW_INTEGRATION_TESTS
+    name: BigQuery
+    if: >
+      github.event_name == 'workflow_dispatch'
+      || (github.event.action == 'labeled' && github.event.label.name == 'Reload Test Data in SQL Engines')
+    runs-on: ubuntu-latest
+    steps:
+      - name: Check-out the repo
+        uses: actions/checkout@v3
+
+      - name: Populate w/Python ${{ env.PYTHON_VERSION }}
+        uses: ./.github/actions/run-mf-tests
+        with:
+          python-version: ${{ env.PYTHON_VERSION }}
+          mf_sql_engine_url: ${{ secrets.MF_BIGQUERY_URL }}
+          mf_sql_engine_password: ${{ secrets.MF_BIGQUERY_PWD }}
+          parallelism: 1
+          make-target: "populate-persistent-source-schema-bigquery"
+
+  databricks-populate:
+    environment: DW_INTEGRATION_TESTS
+    name: Databricks SQL Warehouse
+    if: >
+      github.event_name == 'workflow_dispatch'
+      || (github.event.action == 'labeled' && github.event.label.name == 'Reload Test Data in SQL Engines')
+    runs-on: ubuntu-latest
+    steps:
+      - name: Check-out the repo
+        uses: actions/checkout@v3
+
+      - name: Populate w/Python ${{ env.PYTHON_VERSION }}
+        uses: ./.github/actions/run-mf-tests
+        with:
+          python-version: ${{ env.PYTHON_VERSION }}
+          mf_sql_engine_url: ${{ secrets.MF_DATABRICKS_SQL_WAREHOUSE_URL }}
+          mf_sql_engine_password: ${{ secrets.MF_DATABRICKS_PWD }}
+          parallelism: 1
+          make-target: "populate-persistent-source-schema-databricks"
+
+  remove-label:
+    name: Remove Label After Populating Test Data
+    runs-on: ubuntu-latest
+    needs: [snowflake-populate, redshift-populate, bigquery-populate, databricks-populate]
+    if: github.event.action == 'labeled' && github.event.label.name == 'Reload Test Data in SQL Engines'
+    steps:
+      - name: Remove Label
+        uses: actions-ecosystem/action-remove-labels@v1
+        with:
+          labels: 'Reload Test Data in SQL Engines'
diff --git a/GLOSSARY.md b/GLOSSARY.md
new file mode 100644
index 0000000000..ebb047bef8
--- /dev/null
+++ b/GLOSSARY.md
@@ -0,0 +1,13 @@
+# Glossary
+
+## Persistent source schema
+Many tests generate and execute SQL that depends on tables containing test data. By default, a
+pytest fixture creates a temporary schema and populates it with the tables that are required by
+the tests. This schema is referred to as the source schema. Creating the source schema (and
+the associated tables) can be a slow process for some SQL engines. Since these tables generally
+do not change often, functionality was added to use a source schema that is assumed to already
+exist when running tests and persists between runs (a persistent source schema). In addition,
+functionality was added to create the persistent source schema based on table definitions in the
+repo. Because the name of the source schema is generated based on the hash of the data that's
+supposed to be in the schema, creating and populating the persistent source schema should
+not be done concurrently as there are race conditions when creating tables and inserting data.
diff --git a/Makefile b/Makefile
index bc2d10b863..a0dc3152bd 100644
--- a/Makefile
+++ b/Makefile
@@ -7,6 +7,10 @@ PARALLELISM = "auto"
 # Additional command line options to pass to pytest.
 ADDITIONAL_PYTEST_OPTIONS = ""
 
+# Pytest option and test ID used by the populate-persistent-source-schema-* targets.
+USE_PERSISTENT_SOURCE_SCHEMA = "--use-persistent-source-schema"
+POPULATE_PERSISTENT_SOURCE_SCHEMA = "metricflow/test/source_schema_tools.py::populate_source_schema"
+
 # Install Hatch package / project manager
 .PHONY: install-hatch
 install-hatch:
@@ -21,24 +25,41 @@ test:
 test-postgresql:
 	hatch -v run postgres-env:pytest -vv -n $(PARALLELISM) $(ADDITIONAL_PYTEST_OPTIONS) metricflow/test/
 
-# Engine-specific test environments. In most cases you should run these with
-# `make -e ADDITIONAL_PYTEST_OPTIONS="--use-persistent-source-schema" test-`
+# Engine-specific test environments.
 .PHONY: test-bigquery
 test-bigquery:
 	hatch -v run bigquery-env:pytest -vv -n $(PARALLELISM) $(ADDITIONAL_PYTEST_OPTIONS) metricflow/test/
 
+.PHONY: populate-persistent-source-schema-bigquery
+populate-persistent-source-schema-bigquery:
+	hatch -v run bigquery-env:pytest -vv $(ADDITIONAL_PYTEST_OPTIONS) $(USE_PERSISTENT_SOURCE_SCHEMA) $(POPULATE_PERSISTENT_SOURCE_SCHEMA)
+
 .PHONY: test-databricks
 test-databricks:
 	hatch -v run databricks-env:pytest -vv -n $(PARALLELISM) $(ADDITIONAL_PYTEST_OPTIONS) metricflow/test/
 
+.PHONY: populate-persistent-source-schema-databricks
+populate-persistent-source-schema-databricks:
+	hatch -v run databricks-env:pytest -vv $(ADDITIONAL_PYTEST_OPTIONS) $(USE_PERSISTENT_SOURCE_SCHEMA) $(POPULATE_PERSISTENT_SOURCE_SCHEMA)
+
 .PHONY: test-redshift
 test-redshift:
 	hatch -v run redshift-env:pytest -vv -n $(PARALLELISM) $(ADDITIONAL_PYTEST_OPTIONS) metricflow/test/
 
+.PHONY: populate-persistent-source-schema-redshift
+populate-persistent-source-schema-redshift:
+	hatch -v run redshift-env:pytest -vv $(ADDITIONAL_PYTEST_OPTIONS) $(USE_PERSISTENT_SOURCE_SCHEMA) $(POPULATE_PERSISTENT_SOURCE_SCHEMA)
+
+
 .PHONY: test-snowflake
 test-snowflake:
 	hatch -v run snowflake-env:pytest -vv -n $(PARALLELISM) $(ADDITIONAL_PYTEST_OPTIONS) metricflow/test/
 
+.PHONY: populate-persistent-source-schema-snowflake
+populate-persistent-source-schema-snowflake:
+	hatch -v run snowflake-env:pytest -vv $(ADDITIONAL_PYTEST_OPTIONS) $(USE_PERSISTENT_SOURCE_SCHEMA) $(POPULATE_PERSISTENT_SOURCE_SCHEMA)
+
+
 .PHONY: lint
 lint:
 	hatch -v run dev-env:pre-commit run --all-files