# Export catalogs to CSV #27
# Workflow file for this run

name: Export catalogs to CSV

# Only a manual run (workflow_dispatch) is enabled; the push trigger on the
# main branch is kept commented out below.
on:
  # push:
  #   branches: [ main ]
  workflow_dispatch:
jobs:
  # Reads the JSON source catalogs checked into the repository, flattens them
  # into ./sources.csv and uploads that file as a workflow artifact.
  export-to-csv:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      # This job defines no build matrix, so the version is written out instead
      # of being read from `matrix.python-version` (which expands to nothing).
      - name: Set up Python 3.9
        uses: actions/setup-python@v5
        with:
          # Quoted so YAML keeps the version a string rather than a float.
          python-version: "3.9"
      # NOTE(review): the export script below imports pandas, which is not
      # installed explicitly here; presumably it comes from requirements.txt.
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install pytest wheel numpy
          sudo add-apt-repository -y ppa:ubuntugis/ubuntugis-unstable
          sudo apt-get update
          sudo apt-get install -y gdal-bin python3-gdal
          sudo apt-get install -y libgdal-dev
          pip install GDAL==$(gdal-config --version) --global-option=build_ext --global-option="-I/usr/include/gdal"
          sudo apt-get install -y libspatialindex-dev
          if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
      - name: Export the catalog of sources as CSV
        uses: jannekem/run-python-script-action@v1
        with:
          script: |
            import pandas as pd
            import os
            import json

            CSV_PATH = "./sources.csv"

            # Columns written to the CSV, in output order. Names use the dotted
            # paths produced by pd.json_normalize; columns absent from the data
            # are skipped.
            CSV_COLUMNS = [
                'mdb_source_id',
                'data_type',
                'entity_type',
                'location.country_code',
                'location.subdivision_name',
                'location.municipality',
                'provider',
                'name',
                'note',
                'static_reference',
                'urls.direct_download',
                'urls.authentication_type',
                'urls.authentication_info',
                'urls.api_key_parameter_name',
                'urls.latest',
                'urls.license',
                'location.bounding_box.minimum_latitude',
                'location.bounding_box.maximum_latitude',
                'location.bounding_box.minimum_longitude',
                'location.bounding_box.maximum_longitude',
                'location.bounding_box.extracted_on',
                'status',
                'features',
                'redirects',
                # Must equal REDIRECTS_COMMENTS below, otherwise the column is
                # never found in the normalized data and is silently dropped.
                # NOTE(review): confirm this is the key used in the source JSON.
                'redirects_comments'
            ]

            # tools.constants
            GTFS = "gtfs"
            GTFS_RT = "gtfs-rt"
            MDB_SOURCE_ID = "mdb_source_id"
            DATA_TYPE = "data_type"
            LOCATION = "location"
            COUNTRY_CODE = "country_code"
            SUBDIVISION_NAME = "subdivision_name"
            MUNICIPALITY = "municipality"
            STATIC_REFERENCE = "static_reference"
            ENTITY_TYPE = "entity_type"
            UNKNOWN = "unknown"
            URLS_AUTHENTICATION_TYPE = "urls.authentication_type"
            FEATURES = "features"
            REDIRECTS = "redirects"
            REDIRECTS_COMMENTS = "redirects_comments"

            # tools.constants.GTFS_SCHEDULE_CATALOG_PATH_FROM_ROOT
            GTFS_SCHEDULE_CATALOG_PATH_FROM_ROOT = "catalogs/sources/gtfs/schedule"
            # tools.constants.GTFS_REALTIME_CATALOG_PATH_FROM_ROOT
            GTFS_REALTIME_CATALOG_PATH_FROM_ROOT = "catalogs/sources/gtfs/realtime"

            # tools.operations.get_sources
            gtfs_schedule_catalog_path = os.path.join(".", GTFS_SCHEDULE_CATALOG_PATH_FROM_ROOT)
            gtfs_realtime_catalog_path = os.path.join(".", GTFS_REALTIME_CATALOG_PATH_FROM_ROOT)

            # Load every file found under both catalog directories, keyed by
            # its mdb_source_id. Only files actually returned by os.walk are
            # read: no hard-coded file names are injected and there is no cap
            # on the number of sources, so the realtime catalog is exported too.
            catalog = {}
            for catalog_path in [gtfs_schedule_catalog_path, gtfs_realtime_catalog_path]:
                for path, sub_dirs, files in os.walk(catalog_path):
                    for file in files:
                        print(f'Processing: {file}')
                        with open(os.path.join(path, file)) as fp:
                            entity_json = json.load(fp)
                        entity_id = entity_json[MDB_SOURCE_ID]
                        catalog[entity_id] = entity_json

            # Complete the GTFS Realtime Sources: location information from their static reference
            # and pipe delimited static reference and entity type.
            # NOTE(review): the nesting below was reconstructed from a copy of
            # this file that had lost its indentation; confirm against the
            # repository version.
            for source_id, source in catalog.items():
                if source.get(DATA_TYPE) == GTFS_RT:
                    # `or []` also covers a key that is present but null.
                    static_references = source.get(STATIC_REFERENCE) or []
                    if len(static_references) > 0:
                        # The location is copied from the first referenced
                        # source, when that source is in the catalog and has one.
                        reference_location = catalog.get(static_references[0], {}).get(LOCATION)
                        if reference_location is not None:
                            source[LOCATION] = reference_location
                        source[STATIC_REFERENCE] = "|".join([str(ref_id) for ref_id in static_references])
                    else:
                        source[LOCATION] = {COUNTRY_CODE: UNKNOWN, SUBDIVISION_NAME: UNKNOWN, MUNICIPALITY: UNKNOWN}
                    source[ENTITY_TYPE] = "|".join(source.get(ENTITY_TYPE) or [])
                if len(source.get(FEATURES) or []) > 0:
                    source[FEATURES] = "|".join(source.get(FEATURES))
                # For redirects, allow strings or integers
                if len(source.get(REDIRECTS) or []) > 0:
                    source[REDIRECTS] = "|".join(str(item) for item in source.get(REDIRECTS))
                else:
                    source[REDIRECTS] = ""
                # Redirect comments get the same treatment as redirects, so an
                # empty list is written as an empty cell.
                if len(source.get(REDIRECTS_COMMENTS) or []) > 0:
                    source[REDIRECTS_COMMENTS] = "|".join(str(item) for item in source.get(REDIRECTS_COMMENTS))
                else:
                    source[REDIRECTS_COMMENTS] = ""
                catalog[source_id] = source

            # Sort the catalog by source id and convert it to a list
            catalog = [source for _, source in sorted(catalog.items(), key=lambda item: item[0])]

            # tools.helpers.to_csv
            path = CSV_PATH
            columns = CSV_COLUMNS
            # Flatten nested objects into dotted column names, then keep only
            # the known columns, in CSV_COLUMNS order.
            catalog = pd.json_normalize(catalog)
            tmp = pd.DataFrame()
            for column in columns:
                if column in catalog:
                    tmp[column] = catalog[column]
            catalog = tmp
            # Nullable integer dtype: keeps missing values without turning the
            # present ones into floats.
            if URLS_AUTHENTICATION_TYPE in catalog:
                catalog[URLS_AUTHENTICATION_TYPE] = catalog[URLS_AUTHENTICATION_TYPE].astype('Int64')
            catalog.to_csv(path, sep=",", index=False)
      - name: Upload the catalog of sources CSV artifact
        uses: actions/upload-artifact@v4
        with:
          name: sources.csv
          path: sources.csv
# Disabled job, kept for reference (it belongs under `jobs:` when re-enabled).
#   store-csv:
#     needs: [ export-to-csv ]
#     runs-on: ubuntu-latest
#     steps:
#       - uses: actions/checkout@v2
#       - name: Download the catalog of sources CSV artifact
#         uses: actions/download-artifact@v1
#         with:
#           name: sources.csv
#           path: sources.csv
#       - name: Set up and authorize Cloud
#         uses: google-github-actions/auth@v0
#         with:
#           credentials_json: ${{ secrets.ARCHIVE_DATASET_SA_KEY }}
#       - name: Upload csv to Google Cloud Storage
#         id: upload-csv
#         uses: google-github-actions/upload-cloud-storage@v0
#         with:
#           path: sources.csv
#           destination: mdb-csv
#           parent: false