# Skip to content

# Collecting statistics from AM dataset (#100) #146

# Collecting statistics from AM dataset (#100)

# Collecting statistics from AM dataset (#100) #146

name: Test PIE Dataset

# Run only when dataset builders, their data, their tests/fixtures, or this
# workflow file itself change.
on:
  push:
    branches: [main]
    paths:
      - "dataset_builders/pie/**"
      - "data/datasets/**"
      - "tests/dataset_builders/pie/**"
      - "tests/fixtures/dataset_builders/pie/**"
      - ".github/workflows/test_pie_datasets.yaml"
  pull_request:
    branches: [main, "release/*"]
    paths:
      - "dataset_builders/pie/**"
      - "data/datasets/**"
      - "tests/dataset_builders/pie/**"
      - "tests/fixtures/dataset_builders/pie/**"
      - ".github/workflows/test_pie_datasets.yaml"
jobs:
  # Job that lists modified datasets. Its `datasets` output is a JSON array of
  # dataset names, or an empty string when the "Determine modified datasets"
  # step was skipped.
  collect_modified_datasets:
    runs-on: ubuntu-latest
    outputs:
      datasets: ${{ steps.set-datasets.outputs.datasets }}
    steps:
      - uses: actions/checkout@v4
      - name: Get changed dataset files
        id: changed-files
        # NOTE(review): third-party action referenced by a mutable tag;
        # consider pinning it to a full commit SHA.
        uses: tj-actions/changed-files@v44
        with:
          files_yaml: |
            datasets:
              - 'dataset_builders/pie/**'
              - 'data/datasets/**'
              - 'tests/dataset_builders/pie/**'
              - 'tests/fixtures/dataset_builders/pie/**'
      - name: Determine modified datasets
        # Give it an id to handle step outputs in the outputs key above
        id: set-datasets
        # NOTE: Ensure all outputs are prefixed by the same key used above
        # e.g. `datasets_(...)` when trying to access the `any_changed` output.
        if: steps.changed-files.outputs.datasets_any_changed == 'true'
        env:
          DATASETS_ALL_CHANGED_FILES: ${{ steps.changed-files.outputs.datasets_all_changed_files }}
        run: |
          echo "all changed dataset files: $DATASETS_ALL_CHANGED_FILES"
          # Only sub-directories are dataset builders; plain files (README,
          # __init__.py, ...) must never end up in the test matrix.
          DATASETS=$(for dir in dataset_builders/pie/*/; do
            if [ -d "$dir" ]; then
              basename "$dir"
            fi
          done)
          echo "collected datasets: $DATASETS" | tr '\n' ' '
          # terminate the line printed above (tr removed its newline)
          echo
          PREFIXES="dataset_builders/pie/ data/datasets/ tests/dataset_builders/pie/ tests/fixtures/dataset_builders/pie/"
          echo "file paths to check for changes: $PREFIXES"
          # Keep the entries in DATASETS for which any prefix+dataset occurs in
          # DATASETS_ALL_CHANGED_FILES.
          # NOTE(review): this is a substring match, so a dataset whose name is
          # a prefix of another dataset's name is also selected when only the
          # longer-named one changed.
          DATASETS_FILTERED=$(for dataset in ${DATASETS}; do
            for prefix in ${PREFIXES}; do
              if [[ $DATASETS_ALL_CHANGED_FILES == *"$prefix$dataset"* ]]; then
                echo "$dataset"
                break
              fi
            done
          done)
          echo "filtered datasets: $DATASETS_FILTERED"
          # Convert the newline-separated names to a compact JSON array with jq.
          # Empty entries are dropped so that "no dataset matched" yields `[]`
          # instead of `[""]`.
          DATASETS_JSON=$(echo "$DATASETS_FILTERED" | jq -R -s -c 'split("\n") | map(select(length > 0))')
          echo "datasets=$DATASETS_JSON" >> "$GITHUB_OUTPUT"

  # Runs the test suite of every modified dataset, one matrix job per dataset.
  test_dataset:
    runs-on: ubuntu-latest
    needs: [collect_modified_datasets]  # Depends on previous job
    # Skip when the previous job produced no output (step skipped) or an empty
    # list; an empty matrix would otherwise fail the workflow.
    if: >-
      needs.collect_modified_datasets.outputs.datasets != '' &&
      needs.collect_modified_datasets.outputs.datasets != '[]'
    strategy:
      fail-fast: false
      matrix:
        # List matrix strategy from datasets dynamically collected in the previous job
        dataset: ${{ fromJson(needs.collect_modified_datasets.outputs.datasets) }}
    timeout-minutes: 15
    env:
      # Hand the matrix value to the shell as an environment variable instead
      # of expanding it inside the scripts, so that the name is never
      # interpreted as shell code.
      DATASET_NAME: ${{ matrix.dataset }}
    steps:
      - run: echo "test dataset $DATASET_NAME"
      - name: Check out repository
        uses: actions/checkout@v4
      - name: Set up python
        id: setup-python
        uses: actions/setup-python@v5
        with:
          python-version: '3.9'
          cache: 'pip'  # caching pip dependencies
      - name: Install dependencies
        run: |
          pip install --upgrade pip
          # install dataset requirements
          pip install -r "dataset_builders/pie/${DATASET_NAME}/requirements.txt"
          # check if test requirements file exists and install it
          ADD_REQUIREMENTS_FILE="tests/dataset_builders/pie/${DATASET_NAME}/additional-requirements.txt"
          if [ -f "$ADD_REQUIREMENTS_FILE" ]; then
            pip install -r "$ADD_REQUIREMENTS_FILE"
          fi
          # install pytest and pytest-cov
          pip install pytest pytest-cov
      - name: Run tests
        run: |
          pytest \
            "tests/dataset_builders/pie/${DATASET_NAME}" \
            -k "not slow" \
            --cov="dataset_builders/pie/${DATASET_NAME}" \
            --cov-report="xml:coverage/${DATASET_NAME}.xml"
      - name: Upload coverage
        uses: codecov/codecov-action@v4
        with:
          file: coverage/${{ matrix.dataset }}.xml
          name: ${{ matrix.dataset }}
          fail_ci_if_error: true
        env:
          CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}