Export catalogs to CSV #26
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Manually triggered workflow that exports the catalogs of sources to a CSV
# file and publishes that file as a build artifact.
name: Export catalogs to CSV

on:
  # push:
  #   branches: [ main ]
  workflow_dispatch:

jobs:
  export-to-csv:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      # The job defines no strategy matrix, so the step name spells the
      # version out instead of interpolating `matrix.python-version`.
      - name: Set up Python 3.9
        uses: actions/setup-python@v5
        with:
          # Quoted so YAML keeps the version as a string rather than a float.
          python-version: "3.9"
      - name: Install dependencies
        # `-y` keeps every apt command non-interactive.
        run: |
          python -m pip install --upgrade pip
          pip install pytest wheel numpy
          sudo add-apt-repository -y ppa:ubuntugis/ubuntugis-unstable
          sudo apt-get update
          sudo apt-get install -y gdal-bin python3-gdal
          sudo apt-get install -y libgdal-dev
          pip install GDAL==$(gdal-config --version) --global-option=build_ext --global-option="-I/usr/include/gdal"
          sudo apt-get install -y libspatialindex-dev
          if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
      - name: Export the catalog of sources as CSV
        uses: jannekem/run-python-script-action@v1
        with:
          script: |
"""Export the GTFS Schedule and GTFS Realtime source catalogs to one CSV file.

Every JSON file found under the two catalog directories is loaded, list-valued
fields are flattened to pipe-delimited strings, GTFS Realtime sources inherit
the location of their first static reference, and the result is written to
``CSV_PATH`` with the columns listed in ``CSV_COLUMNS``.
"""
import json
import os

import pandas as pd

CSV_PATH = "./sources.csv"

# tools.constants
GTFS = "gtfs"
GTFS_RT = "gtfs-rt"
MDB_SOURCE_ID = "mdb_source_id"
DATA_TYPE = "data_type"
LOCATION = "location"
COUNTRY_CODE = "country_code"
SUBDIVISION_NAME = "subdivision_name"
MUNICIPALITY = "municipality"
STATIC_REFERENCE = "static_reference"
ENTITY_TYPE = "entity_type"
UNKNOWN = "unknown"
URLS_AUTHENTICATION_TYPE = "urls.authentication_type"
FEATURES = "features"
REDIRECTS = "redirects"
# NOTE(review): this is the key the script reads from the source JSON; confirm
# the spelling against the catalog schema.
REDIRECTS_COMMENTS = "redirects_comments"

# The redirect columns reuse the key constants so that the column filter and
# the keys written by complete_sources() cannot drift apart.
CSV_COLUMNS = [
    'mdb_source_id',
    'data_type',
    'entity_type',
    'location.country_code',
    'location.subdivision_name',
    'location.municipality',
    'provider',
    'name',
    'note',
    'static_reference',
    'urls.direct_download',
    'urls.authentication_type',
    'urls.authentication_info',
    'urls.api_key_parameter_name',
    'urls.latest',
    'urls.license',
    'location.bounding_box.minimum_latitude',
    'location.bounding_box.maximum_latitude',
    'location.bounding_box.minimum_longitude',
    'location.bounding_box.maximum_longitude',
    'location.bounding_box.extracted_on',
    'status',
    'features',
    REDIRECTS,
    REDIRECTS_COMMENTS,
]

# tools.constants.GTFS_SCHEDULE_CATALOG_PATH_FROM_ROOT
GTFS_SCHEDULE_CATALOG_PATH_FROM_ROOT = "catalogs/sources/gtfs/schedule"
# tools.constants.GTFS_REALTIME_CATALOG_PATH_FROM_ROOT
GTFS_REALTIME_CATALOG_PATH_FROM_ROOT = "catalogs/sources/gtfs/realtime"


def _pipe_join(values):
    """Return *values* as one pipe-delimited string.

    Items are converted with ``str`` so integers and strings can be mixed.
    A value that is already a string is returned unchanged, which keeps the
    operation idempotent.
    """
    if isinstance(values, str):
        return values
    return "|".join(str(value) for value in values)


# tools.operations.get_sources
def load_catalog(catalog_paths, max_sources=None):
    """Load every ``*.json`` source found under *catalog_paths*.

    Args:
        catalog_paths: Directories to walk recursively. A directory that does
            not exist contributes nothing.
        max_sources: Optional cap on the number of files loaded, for trial
            runs. ``None`` (the default) loads everything.

    Returns:
        A dict mapping each source's ``mdb_source_id`` to its decoded JSON.

    Raises:
        KeyError: If a JSON file has no ``mdb_source_id`` key.
    """
    catalog = {}
    loaded = 0
    for catalog_path in catalog_paths:
        for path, _sub_dirs, files in os.walk(catalog_path):
            # Sorted so the log and any max_sources cut-off are deterministic.
            for file in sorted(files):
                if not file.endswith(".json"):
                    continue
                if max_sources is not None and loaded >= max_sources:
                    return catalog
                print(f'Processing: {file}')
                with open(os.path.join(path, file), encoding="utf-8") as fp:
                    entity_json = json.load(fp)
                catalog[entity_json[MDB_SOURCE_ID]] = entity_json
                loaded += 1
    return catalog


def complete_sources(catalog):
    """Flatten list fields and complete GTFS Realtime sources, in place.

    For GTFS Realtime sources the location is copied from the first static
    reference when that reference is in *catalog* and has a location; a
    realtime source without any static reference gets an all-``unknown``
    location. Static references, entity types, features, redirects and
    redirect comments are turned into pipe-delimited strings.

    NOTE(review): the nesting of the original statements was reconstructed;
    confirm that features should be flattened for every data type.

    Args:
        catalog: Dict of source id to source dict, as built by load_catalog().

    Returns:
        The same *catalog* object, for convenience.
    """
    for source in catalog.values():
        if source.get(DATA_TYPE) == GTFS_RT:
            static_references = source.get(STATIC_REFERENCE) or []
            if static_references:
                referenced_location = catalog.get(static_references[0], {}).get(LOCATION)
                if referenced_location is not None:
                    source[LOCATION] = referenced_location
                source[STATIC_REFERENCE] = _pipe_join(static_references)
            else:
                source[LOCATION] = {
                    COUNTRY_CODE: UNKNOWN,
                    SUBDIVISION_NAME: UNKNOWN,
                    MUNICIPALITY: UNKNOWN,
                }
            # A missing entity type is left missing instead of raising.
            if source.get(ENTITY_TYPE) is not None:
                source[ENTITY_TYPE] = _pipe_join(source[ENTITY_TYPE])

        if source.get(FEATURES):
            source[FEATURES] = _pipe_join(source[FEATURES])

        # For redirects, allow strings or integers. Both redirect fields are
        # always set so their CSV columns exist even when no source has any.
        source[REDIRECTS] = _pipe_join(source.get(REDIRECTS) or [])
        source[REDIRECTS_COMMENTS] = _pipe_join(source.get(REDIRECTS_COMMENTS) or [])
    return catalog


# tools.helpers.to_csv
def to_frame(sources, columns=None):
    """Build the export DataFrame from a list of source dicts.

    Nested dicts are flattened to dotted column names by
    ``pandas.json_normalize``; only the requested columns that actually occur
    are kept, in the requested order.

    Args:
        sources: List of source dicts.
        columns: Column names to keep; defaults to ``CSV_COLUMNS``.

    Returns:
        A new DataFrame restricted to the available requested columns.
    """
    if columns is None:
        columns = CSV_COLUMNS
    frame = pd.json_normalize(sources)
    frame = frame[[column for column in columns if column in frame.columns]].copy()
    if URLS_AUTHENTICATION_TYPE in frame.columns:
        # Nullable integer dtype: missing values would otherwise turn the
        # column into floats and be written as e.g. "1.0".
        frame[URLS_AUTHENTICATION_TYPE] = frame[URLS_AUTHENTICATION_TYPE].astype('Int64')
    return frame


def main():
    """Load both catalogs, complete the sources and write ``CSV_PATH``."""
    catalog = load_catalog([
        os.path.join(".", GTFS_SCHEDULE_CATALOG_PATH_FROM_ROOT),
        os.path.join(".", GTFS_REALTIME_CATALOG_PATH_FROM_ROOT),
    ])
    complete_sources(catalog)
    # Sort the catalog by source id and convert it to a list.
    sources = [catalog[source_id] for source_id in sorted(catalog)]
    to_frame(sources).to_csv(CSV_PATH, sep=",", index=False)


if __name__ == "__main__":
    main()
# Publishes the CSV written by the export step as a workflow artifact.
- name: Upload the catalog of sources CSV artifact
  uses: actions/upload-artifact@v4
  with:
    name: sources.csv
    path: sources.csv
    # Fail the run instead of silently publishing nothing.
    if-no-files-found: error
#   store-csv:
#     needs: [ export-to-csv ]
#     runs-on: ubuntu-latest
#     steps:
#       - uses: actions/checkout@v2
#       - name: Download the catalog of sources CSV artifact
#         uses: actions/download-artifact@v1
#         with:
#           name: sources.csv
#           path: sources.csv
#       - name: Set up and authorize Cloud
#         uses: google-github-actions/auth@v0
#         with:
#           credentials_json: ${{ secrets.ARCHIVE_DATASET_SA_KEY }}
#       - name: Upload csv to Google Cloud Storage
#         id: upload-csv
#         uses: google-github-actions/upload-cloud-storage@v0
#         with:
#           path: sources.csv
#           destination: mdb-csv
#           parent: false