Skip to content

test workflow per dataset #28

test workflow per dataset

test workflow per dataset #28

name: Test Pie Dataset

# Trigger only for pushes to main, or pull requests targeting main or a
# release branch, that touch one of the paths listed below (dataset builders,
# dataset data, their tests and fixtures, or this workflow file itself).
on:
  push:
    branches:
      - main
    paths:
      - "dataset_builders/pie/**"
      - "data/datasets/**"
      - "tests/dataset_builders/pie/test_**"
      - "tests/fixtures/dataset_builders/pie/**"
      - ".github/workflows/test_pie_datasets.yaml"
  pull_request:
    branches:
      - main
      - "release/*"
    paths:
      - "dataset_builders/pie/**"
      - "data/datasets/**"
      - "tests/dataset_builders/pie/test_**"
      - "tests/fixtures/dataset_builders/pie/**"
      - ".github/workflows/test_pie_datasets.yaml"
jobs:
  # Works out which dataset directories under dataset_builders/pie are affected
  # by the changed files and publishes their names as a JSON array.
  collect_datasets:
    runs-on: ubuntu-latest
    outputs:
      # JSON array of dataset names. The set-datasets step always writes this
      # output, using "[]" when no dataset-related file changed.
      datasets: ${{ steps.set-datasets.outputs.datasets }}
    steps:
      - uses: actions/checkout@v4
      - name: Get changed dataset files
        id: changed-files
        # NOTE(review): third-party action referenced by a mutable tag;
        # consider pinning it to a full commit SHA.
        uses: tj-actions/changed-files@v44
        with:
          files_yaml: |
            datasets:
              - 'dataset_builders/pie/**'
              - 'data/datasets/**'
              - 'tests/dataset_builders/pie/test_**'
              - 'tests/fixtures/dataset_builders/pie/**'
      - name: Set datasets
        # The job-level `outputs` key above reads this step's output via its id,
        # so this single step must produce `datasets` on every code path.
        id: set-datasets
        env:
          # NOTE: outputs of changed-files are prefixed with the key used in
          # files_yaml above, i.e. `datasets_(...)`.
          DATASETS_ANY_CHANGED: ${{ steps.changed-files.outputs.datasets_any_changed }}
          DATASETS_ALL_CHANGED_FILES: ${{ steps.changed-files.outputs.datasets_all_changed_files }}
        run: |
          # Nothing relevant changed: publish an empty list and stop.
          if [[ "$DATASETS_ANY_CHANGED" != 'true' ]]; then
            echo "datasets=[]" >> "$GITHUB_OUTPUT"
            exit 0
          fi
          DATASETS=$(ls dataset_builders/pie)
          echo "collected datasets: $DATASETS" | tr '\n' ' '
          # terminate the line printed above
          echo
          PREFIXES="dataset_builders/pie/ data/datasets/ tests/dataset_builders/pie/test_ tests/fixtures/dataset_builders/pie/"
          echo "file paths to check for changes: $PREFIXES"
          # Keep the entries of DATASETS for which any prefix+dataset occurs in
          # DATASETS_ALL_CHANGED_FILES. This is a substring match, so a dataset
          # whose name is a prefix of another dataset's name is selected too.
          DATASETS_FILTERED=$(for dataset in ${DATASETS}; do
            for prefix in ${PREFIXES}; do
              if [[ $DATASETS_ALL_CHANGED_FILES == *"$prefix$dataset"* ]]; then
                echo "$dataset"
                break
              fi
            done
          done)
          echo "filtered datasets: $DATASETS_FILTERED"
          # Convert the newline-separated names to a compact JSON array with jq.
          # Empty entries are dropped so that an empty filter result becomes []
          # instead of [""].
          echo "datasets=$(echo "$DATASETS_FILTERED" | jq -R -s -c 'split("\n") | map(select(length > 0))')" >> "$GITHUB_OUTPUT"

  # Runs once per affected dataset reported by collect_datasets.
  test_dataset:
    runs-on: ubuntu-latest
    needs: [collect_datasets]
    # A matrix built from an empty list makes the run fail, so skip this job
    # entirely when no dataset is affected.
    if: needs.collect_datasets.outputs.datasets != '[]'
    strategy:
      matrix:
        # One matrix entry per dataset name from the JSON array.
        dataset: ${{ fromJson(needs.collect_datasets.outputs.datasets) }}
        # Previously undefined, which handed setup-python an empty version.
        # NOTE(review): "3.9" is an assumption - confirm the supported version.
        python-version: ["3.9"]
    timeout-minutes: 10
    steps:
      # The dataset name is passed through an env var rather than interpolated
      # into the script text, so it is never parsed as shell code.
      - run: echo "test dataset $DATASET"
        env:
          DATASET: ${{ matrix.dataset }}
      - name: Check out repository
        uses: actions/checkout@v4
      - name: Set up python
        id: setup-python
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}
          cache: 'pip' # caching pip dependencies
      - name: Install dependencies
        env:
          DATASET: ${{ matrix.dataset }}
        run: pip install -r "dataset_builders/pie/$DATASET/requirements.txt"