Skip to content

Commit

Permalink
Add GitHub workflow to populate the persistent source schema.
Browse files Browse the repository at this point in the history
  • Loading branch information
plypaul committed Aug 8, 2023
1 parent a917a6b commit a739d97
Show file tree
Hide file tree
Showing 3 changed files with 151 additions and 0 deletions.
115 changes: 115 additions & 0 deletions .github/workflows/cd-sql-engine-populate-persistent-source-schema.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
---
# See [Persistent Source Schema](/GLOSSARY.md#persistent-source-schema)
# Populating the source schema via this workflow ensures that it's done with the same settings as the tests.

name: Reload SQL Engine Test Data

# Only one populate run may touch the schema at a time — the tables are keyed by a
# content hash and concurrent creation/insertion races (see GLOSSARY.md).
# NOTE(review): `cancel-in-progress: true` cancels a running populate when a new one
# starts; confirm a cancelled run cannot leave a partially populated schema — if it
# can, queueing (cancel-in-progress: false) may be safer.
concurrency:
  group: POPULATE_PERSISTENT_SOURCE_SCHEMA
  cancel-in-progress: true

# Runs on every push to main, and on PRs when the trigger label is applied.
on:
  push:
    branches:
      - main
  pull_request:
    types: [labeled]

env:
  # Quoted so "3.8" stays a string (unquoted it would parse as the float 3.8).
  PYTHON_VERSION: "3.8"
  # Tells pytest to target the pre-existing persistent source schema.
  ADDITIONAL_PYTEST_OPTIONS: "--use-persistent-source-schema"

jobs:
  snowflake-populate:
    environment: DW_INTEGRATION_TESTS
    # Run on push events unconditionally; for `labeled` PR events, only when the
    # applied label matches this workflow's trigger label.
    if: ${{ github.event.action != 'labeled' || github.event.label.name == 'Reload SQL Engine Test Data' }}
    name: Snowflake
    runs-on: ubuntu-latest
    steps:
      - name: Check-out the repo
        uses: actions/checkout@v3

      - name: Populate w/Python ${{ env.PYTHON_VERSION }}
        uses: ./.github/actions/run-mf-tests
        with:
          python-version: ${{ env.PYTHON_VERSION }}
          mf_sql_engine_url: ${{ secrets.MF_SNOWFLAKE_URL }}
          mf_sql_engine_password: ${{ secrets.MF_SNOWFLAKE_PWD }}
          parallelism: 1
          additional-pytest-options: ${{ env.ADDITIONAL_PYTEST_OPTIONS }}
          make-target: "populate-persistent-source-schema-snowflake"

  # The jobs below are disabled. They mirror the Snowflake job for the other
  # engines. NOTE(review): as originally written, their `if` conditions checked
  # the label 'populate_persistent_source_schema', which does not match the label
  # the active job checks ('Reload SQL Engine Test Data'); the conditions below
  # have been aligned — verify the intended label name before enabling.
  #
  # redshift-populate:
  #   environment: DW_INTEGRATION_TESTS
  #   name: Redshift
  #   if: ${{ github.event.action != 'labeled' || github.event.label.name == 'Reload SQL Engine Test Data' }}
  #   runs-on: ubuntu-latest
  #   steps:
  #     - name: Check-out the repo
  #       uses: actions/checkout@v3
  #
  #     - name: Populate w/Python ${{ env.PYTHON_VERSION }}
  #       uses: ./.github/actions/run-mf-tests
  #       with:
  #         python-version: ${{ env.PYTHON_VERSION }}
  #         mf_sql_engine_url: ${{ secrets.MF_REDSHIFT_URL }}
  #         mf_sql_engine_password: ${{ secrets.MF_REDSHIFT_PWD }}
  #         parallelism: 1
  #         additional-pytest-options: ${{ env.ADDITIONAL_PYTEST_OPTIONS }}
  #         make-target: "populate-persistent-source-schema-redshift"
  #
  # bigquery-populate:
  #   environment: DW_INTEGRATION_TESTS
  #   name: BigQuery
  #   if: ${{ github.event.action != 'labeled' || github.event.label.name == 'Reload SQL Engine Test Data' }}
  #   runs-on: ubuntu-latest
  #   steps:
  #     - name: Check-out the repo
  #       uses: actions/checkout@v3
  #
  #     - name: Populate w/Python ${{ env.PYTHON_VERSION }}
  #       uses: ./.github/actions/run-mf-tests
  #       with:
  #         python-version: ${{ env.PYTHON_VERSION }}
  #         # NOTE(review): these two input keys were originally uppercase
  #         # (MF_SQL_ENGINE_URL / MF_SQL_ENGINE_PASSWORD), unlike every other
  #         # job — lowercased here to match the action's inputs; confirm.
  #         mf_sql_engine_url: ${{ secrets.MF_BIGQUERY_URL }}
  #         mf_sql_engine_password: ${{ secrets.MF_BIGQUERY_PWD }}
  #         parallelism: 1
  #         additional-pytest-options: ${{ env.ADDITIONAL_PYTEST_OPTIONS }}
  #         make-target: "populate-persistent-source-schema-bigquery"
  #
  # databricks-cluster-populate:
  #   environment: DW_INTEGRATION_TESTS
  #   name: Databricks Cluster
  #   if: ${{ github.event.action != 'labeled' || github.event.label.name == 'Reload SQL Engine Test Data' }}
  #   runs-on: ubuntu-latest
  #   steps:
  #     - name: Check-out the repo
  #       uses: actions/checkout@v3
  #
  #     - name: Populate w/Python ${{ env.PYTHON_VERSION }}
  #       uses: ./.github/actions/run-mf-tests
  #       with:
  #         python-version: ${{ env.PYTHON_VERSION }}
  #         mf_sql_engine_url: ${{ secrets.MF_DATABRICKS_CLUSTER_URL }}
  #         mf_sql_engine_password: ${{ secrets.MF_DATABRICKS_PWD }}
  #         parallelism: 1
  #         additional-pytest-options: ${{ env.ADDITIONAL_PYTEST_OPTIONS }}
  #         make-target: "populate-persistent-source-schema-databricks"
  #
  # databricks-sql-warehouse-populate:
  #   environment: DW_INTEGRATION_TESTS
  #   name: Databricks SQL Warehouse
  #   if: ${{ github.event.action != 'labeled' || github.event.label.name == 'Reload SQL Engine Test Data' }}
  #   runs-on: ubuntu-latest
  #   steps:
  #     - name: Check-out the repo
  #       uses: actions/checkout@v3
  #
  #     - name: Populate w/Python ${{ env.PYTHON_VERSION }}
  #       uses: ./.github/actions/run-mf-tests
  #       with:
  #         python-version: ${{ env.PYTHON_VERSION }}
  #         mf_sql_engine_url: ${{ secrets.MF_DATABRICKS_SQL_WAREHOUSE_URL }}
  #         mf_sql_engine_password: ${{ secrets.MF_DATABRICKS_PWD }}
  #         parallelism: 1
  #         additional-pytest-options: ${{ env.ADDITIONAL_PYTEST_OPTIONS }}
  #         make-target: "populate-persistent-source-schema-databricks"
13 changes: 13 additions & 0 deletions GLOSSARY.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Glossary

## Persistent source schema
Many tests generate and execute SQL that depend on tables containing test data. By default, a
pytest fixture creates a temporary schema and populates it with the tables that are required by
the tests. This schema is referred to as the source schema. Creating the source schema (and
the associated tables) can be a slow process for some SQL engines. Since these tables generally
do not change often, functionality was added to use a source schema that is assumed to already
exist when running tests and persists between runs (a persistent source schema). In addition,
functionality was added to create the persistent source schema based on table definitions in the
repo. Because the name of the source schema is generated from a hash of the data that's
supposed to be in the schema, creating and populating the persistent source schema should
not be done concurrently, as there are race conditions when creating tables and inserting data.
23 changes: 23 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@ PARALLELISM = "auto"
# Additional command line options to pass to pytest.
ADDITIONAL_PYTEST_OPTIONS = ""

# Pytest node ID that populates the persistent source schema.
POPULATE_PERSISTENT_SOURCE_SCHEMA = metricflow/test/source_schema_tools.py::populate_source_schema

# Install Hatch package / project manager
.PHONY: install-hatch
install-hatch:
Expand All @@ -23,22 +26,42 @@ test-postgresql:

# Engine-specific test environments. In most cases you should run these with
# `make -e ADDITIONAL_PYTEST_OPTIONS="--use-persistent-source-schema" test-<engine_type>`
#
# See: [Persistent Source Schema](/GLOSSARY.md#persistent-source-schema)
# Run the test suite against BigQuery with $(PARALLELISM) pytest workers.
.PHONY: test-bigquery
test-bigquery:
hatch -v run bigquery-env:pytest -vv -n $(PARALLELISM) $(ADDITIONAL_PYTEST_OPTIONS) metricflow/test/

# Populate the persistent source schema using the BigQuery engine.
# BUG FIX: the target was named `populate-persistent-source-schema` (missing the
# `-bigquery` suffix), so it did not match its .PHONY declaration or the
# `make-target: "populate-persistent-source-schema-bigquery"` used by the
# CD workflow — invoking that make target would have failed.
# Runs without -n/$(PARALLELISM): population must not run concurrently
# (see GLOSSARY.md, "Persistent source schema").
.PHONY: populate-persistent-source-schema-bigquery
populate-persistent-source-schema-bigquery:
	hatch -v run bigquery-env:pytest -vv $(ADDITIONAL_PYTEST_OPTIONS) $(POPULATE_PERSISTENT_SOURCE_SCHEMA)

# Run the test suite against a Databricks cluster with $(PARALLELISM) pytest workers.
.PHONY: test-databricks
test-databricks:
hatch -v run databricks-env:pytest -vv -n $(PARALLELISM) $(ADDITIONAL_PYTEST_OPTIONS) metricflow/test/

# Populate the persistent source schema using the Databricks engine.
# No -n/$(PARALLELISM): population must not run concurrently (see GLOSSARY.md).
.PHONY: populate-persistent-source-schema-databricks
populate-persistent-source-schema-databricks:
hatch -v run databricks-env:pytest -vv $(ADDITIONAL_PYTEST_OPTIONS) $(POPULATE_PERSISTENT_SOURCE_SCHEMA)

# Run the test suite against Redshift with $(PARALLELISM) pytest workers.
.PHONY: test-redshift
test-redshift:
hatch -v run redshift-env:pytest -vv -n $(PARALLELISM) $(ADDITIONAL_PYTEST_OPTIONS) metricflow/test/

# Populate the persistent source schema using the Redshift engine.
# No -n/$(PARALLELISM): population must not run concurrently (see GLOSSARY.md).
.PHONY: populate-persistent-source-schema-redshift
populate-persistent-source-schema-redshift:
hatch -v run redshift-env:pytest -vv $(ADDITIONAL_PYTEST_OPTIONS) $(POPULATE_PERSISTENT_SOURCE_SCHEMA)


# Run the test suite against Snowflake with $(PARALLELISM) pytest workers.
.PHONY: test-snowflake
test-snowflake:
hatch -v run snowflake-env:pytest -vv -n $(PARALLELISM) $(ADDITIONAL_PYTEST_OPTIONS) metricflow/test/

# Populate the persistent source schema using the Snowflake engine.
# Invoked by the CD workflow via make-target "populate-persistent-source-schema-snowflake".
# No -n/$(PARALLELISM): population must not run concurrently (see GLOSSARY.md).
.PHONY: populate-persistent-source-schema-snowflake
populate-persistent-source-schema-snowflake:
hatch -v run snowflake-env:pytest -vv $(ADDITIONAL_PYTEST_OPTIONS) $(POPULATE_PERSISTENT_SOURCE_SCHEMA)


# Run all pre-commit lint hooks over the whole repository.
.PHONY: lint
lint:
hatch -v run dev-env:pre-commit run --all-files
Expand Down

0 comments on commit a739d97

Please sign in to comment.