name: Compare File Hash Weekly

# Compares the spreadsheet bundled with the package against the copy currently
# published by EUROSTAT. The job fails when the SHA-256 hashes differ, when the
# download fails, or when either hash could not be computed.
on:
  schedule:
    - cron: '0 0 * * 0' # Runs weekly on Sunday at midnight UTC
  workflow_dispatch: # Allows triggering the check manually

jobs:
  compare-hash:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout Repository
        uses: actions/checkout@v4

      - name: Calculate Local File Hash
        id: local-hash
        run: echo "LOCAL_HASH=$(sha256sum pysquirrel/data/NUTS2021-NUTS2024.xlsx | awk '{print $1}')" >> "$GITHUB_ENV"

      - name: Download File
        # --fail stops on HTTP errors instead of saving the error page, and
        # --location follows redirects, so only the real file is ever hashed.
        run: curl --fail --location --silent --show-error --retry 3 -o most-recent-version.xlsx "https://ec.europa.eu/eurostat/documents/345175/629341/NUTS2021-NUTS2024.xlsx"

      - name: Calculate Downloaded File Hash
        id: downloaded-hash
        run: echo "DOWNLOADED_HASH=$(sha256sum most-recent-version.xlsx | awk '{print $1}')" >> "$GITHUB_ENV"

      - name: Compare Hashes
        run: |
          # Two empty hashes would compare equal, so reject them explicitly.
          if [ -z "$LOCAL_HASH" ] || [ -z "$DOWNLOADED_HASH" ]; then
            echo "Could not compute one of the hashes!"
            exit 1
          fi
          if [ "$LOCAL_HASH" != "$DOWNLOADED_HASH" ]; then
            echo "Hashes do not match!"
            exit 1
          else
            echo "Hashes match!"
          fi
def nuts_to_yaml(file_path: str, output_dir: str):
    """Convert a NUTS .xlsx source file to YAML files.

    Reads the "NUTS2024" and "Statistical Regions" sheets of the workbook at
    *file_path* and writes one YAML file per sheet ("NUTS2021-2024.yaml" and
    "SR2021-2024.yaml") into *output_dir*. Each file contains a list of
    mappings, one per data row, keyed by the column names taken from the
    sheet's header row.

    Only the first ``MAX_DATA_COL`` columns are read, and a row is skipped
    when any of those cells holds a falsy value (e.g. an empty cell).

    :param file_path: path to the NUTS .xlsx spreadsheet
    :param output_dir: directory the YAML files are written to; it is not
        created by this function
    """
    sheet_to_file = {
        "NUTS2024": "NUTS2021-2024.yaml",
        "Statistical Regions": "SR2021-2024.yaml",
    }

    workbook = load_workbook(file_path, read_only=True, data_only=True)
    try:
        for sheet, file in sheet_to_file.items():
            worksheet = workbook[sheet]
            cols = [cell.value for cell in worksheet[COL_NAME_ROW]]
            # zip() stops at the shorter sequence, so header cells beyond
            # MAX_DATA_COL are ignored.
            regions = [
                {col: cell.value for col, cell in zip(cols, row)}
                for row in worksheet.iter_rows(
                    min_row=MIN_DATA_ROW, max_col=MAX_DATA_COL
                )
                if all(cell.value for cell in row)
            ]

            # allow_unicode=True emits non-ASCII characters verbatim, so the
            # file is written as UTF-8 to match how the YAML data is read back.
            with open(Path(output_dir) / file, "w", encoding="utf-8") as f:
                yaml.dump(regions, f, allow_unicode=True)
    finally:
        # Read-only workbooks keep the source file open until closed.
        workbook.close()
/ data_file, "r", encoding="utf8") as f: data = yaml.safe_load(f) for region in data: diff --git a/pysquirrel/data/NUTS2021-NUTS2024.xlsx b/pysquirrel/data/NUTS2021-NUTS2024.xlsx new file mode 100644 index 0000000..04a2f8d Binary files /dev/null and b/pysquirrel/data/NUTS2021-NUTS2024.xlsx differ