From fc032224af2b34691085941f4912bf4a62fcd3a0 Mon Sep 17 00:00:00 2001 From: Heiko Mueller Date: Wed, 3 Feb 2021 17:35:17 -0500 Subject: [PATCH 01/11] Add package information and file size to downloaded file metadata --- docs/examples/Usage Example.ipynb | 4 +- refdata/cli/store.py | 10 ++--- refdata/config.py | 2 +- refdata/db.py | 20 ++++++++- refdata/store/__init__.py | 2 +- refdata/store/base.py | 73 +++++++++++++++++++++++++++---- tests/store/conftest.py | 3 ++ tests/store/test_local_store.py | 33 +++++++++++--- tests/test_database.py | 30 +++++++++++-- tests/test_version.py | 15 ------- 10 files changed, 150 insertions(+), 42 deletions(-) delete mode 100644 tests/test_version.py diff --git a/docs/examples/Usage Example.ipynb b/docs/examples/Usage Example.ipynb index 9d22784..cf73319 100644 --- a/docs/examples/Usage Example.ipynb +++ b/docs/examples/Usage Example.ipynb @@ -31,9 +31,9 @@ "source": [ "# Create an instance of the local data store with default settings.\n", "\n", - "from refdata.store import LocalStore\n", + "from refdata.store import RefStore\n", "\n", - "refstore = LocalStore()" + "refstore = RefStore()" ] }, { diff --git a/refdata/cli/store.py b/refdata/cli/store.py index d08cbea..10c4d76 100644 --- a/refdata/cli/store.py +++ b/refdata/cli/store.py @@ -10,7 +10,7 @@ import click from refdata.repo import RepositoryManager -from refdata.store.base import LocalStore +from refdata.store.base import RefStore import refdata.cli.util as util @@ -32,7 +32,7 @@ def download_dataset(basedir, db, index, key): """List local store content.""" # Read the index of given. doc = util.read_index(index) if index is not None else None - store = LocalStore(basedir=basedir, repo=RepositoryManager(doc=doc), connect_url=db) + store = RefStore(basedir=basedir, repo=RepositoryManager(doc=doc), connect_url=db) store.download(key) @@ -44,7 +44,7 @@ def list_datasets(basedir, db, index): """List local store content.""" # Read the index of given. doc = util.read_index(index) if index is not None else None - store = LocalStore(basedir=basedir, repo=RepositoryManager(doc=doc), connect_url=db) + store = RefStore(basedir=basedir, repo=RepositoryManager(doc=doc), connect_url=db) util.print_datasets(store.list()) @@ -62,7 +62,7 @@ def remove_dataset(basedir, db, index, force, key): click.confirm(msg, default=True, abort=True) # Read the index of given. doc = util.read_index(index) if index is not None else None - store = LocalStore(basedir=basedir, repo=RepositoryManager(doc=doc), connect_url=db) + store = RefStore(basedir=basedir, repo=RepositoryManager(doc=doc), connect_url=db) store.remove(key) @@ -76,5 +76,5 @@ def show_dataset(basedir, db, index, raw, key): """Show descriptor for downloaded dataset.""" # Read the index of given. doc = util.read_index(index) if index is not None else None - store = LocalStore(basedir=basedir, repo=RepositoryManager(doc=doc), connect_url=db) + store = RefStore(basedir=basedir, repo=RepositoryManager(doc=doc), connect_url=db) util.print_dataset(dataset=store.open(key), raw=raw) diff --git a/refdata/config.py b/refdata/config.py index 9252475..c129778 100644 --- a/refdata/config.py +++ b/refdata/config.py @@ -51,7 +51,7 @@ def AUTO_DOWNLOAD() -> bool: def BASEDIR() -> str: """Get the current value for the environment variable REFDATA_BASEDIR. - If the value is not set (missing or empty) the folder `.refdata` in the + If the value is not set (missing or empty) the folder `refdata` in the OS-specific data cache directory for the current user is used as the default. 
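For context, a minimal usage sketch of the RefStore entry point that this patch introduces (illustrative only; 'countries' is an example dataset key, and the constructor arguments are those documented in refdata/store/base.py below):

    from refdata.store import RefStore

    # Uses REFDATA_BASEDIR (or the OS-specific user cache directory) as the
    # base directory and downloads datasets on first access.
    refstore = RefStore(auto_download=True)
    # List the datasets that the associated repository index advertises.
    for dataset in refstore.repository().find():
        print(dataset.identifier)
    # Open a dataset handle; with auto_download enabled this triggers the download.
    countries = refstore.open('countries')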
diff --git a/refdata/db.py b/refdata/db.py index 02af7bd..4c0fd19 100644 --- a/refdata/db.py +++ b/refdata/db.py @@ -9,10 +9,11 @@ metadata management. """ +import datetime import json import uuid -from sqlalchemy import Column, String, create_engine +from sqlalchemy import Column, Integer, String, create_engine from sqlalchemy.ext.declarative import declarative_base from sqlalchemy.orm import sessionmaker from sqlalchemy.types import TypeDecorator, Unicode @@ -54,12 +55,25 @@ def process_result_value(self, value, dialect): return json.loads(value) +def local_time() -> str: + """Get the current time as a string in ISO format. + + Returns + ------- + string + """ + return datetime.datetime.now().isoformat() + + class Dataset(Base): """Descriptor for dataset that has been downloaded to the local data store. Each dataset has two identifier, (i) the identifier that is part of the dataset descriptor (`key`), and (ii) an internal identifier (`dataset_id`). Users will reference datasets by their key. The internal dataset identifier specifies the subfolder under which the dataset files are stored. + + With each downloaded dataset we maintain a reference to the package that + created the local store instance an initiated the download. """ # -- Schema --------------------------------------------------------------- __tablename__ = 'dataset' @@ -67,6 +81,10 @@ class Dataset(Base): dataset_id = Column(String(32), default=DATASET_ID, primary_key=True) key = Column(String(1024), nullable=False, unique=True) descriptor = Column(JsonObject, nullable=False) + package_name = Column(String(256), nullable=False) + package_version = Column(String(256), nullable=False) + created_at = Column(String(26), default=local_time, nullable=False) + filesize = Column(Integer, nullable=False) # -- Database Object ---------------------------------------------------------- diff --git a/refdata/store/__init__.py b/refdata/store/__init__.py index 7ac7a27..3cbdedd 100644 --- a/refdata/store/__init__.py +++ b/refdata/store/__init__.py @@ -5,4 +5,4 @@ # refdata is free software; you can redistribute it and/or modify it under the # terms of the MIT License; see LICENSE file for more details. -from refdata.store.base import LocalStore, download_file # noqa: F401 +from refdata.store.base import LocalStore, RefStore, download_file # noqa: F401 diff --git a/refdata/store/base.py b/refdata/store/base.py index fe1525f..6fde5f1 100644 --- a/refdata/store/base.py +++ b/refdata/store/base.py @@ -20,6 +20,7 @@ from refdata.store.dataset import DatasetHandle from refdata.db import Dataset, DATASET_ID, DB, SessionScope from refdata.repo import RepositoryManager +from refdata.version import __version__ import refdata.config as config import refdata.error as err @@ -28,14 +29,15 @@ class LocalStore: """The local dataset store maintains downloaded datasets on the file system. All datasets are maintained in subfolders of a base directory. By default, - the base directory is in the users home directory under `.refdata`. + the base directory is in the users cache directory under the package name. Information about downloaded datasets is maintaind in an SQLite database `refdata.db` that is created in the base directory. The data file for each downloaded dataset is maintained in a separate subfolder. 
""" def __init__( - self, basedir: Optional[str] = None, repo: Optional[RepositoryManager] = None, + self, package_name: str, package_version: str, + basedir: Optional[str] = None, repo: Optional[RepositoryManager] = None, auto_download: Optional[bool] = None, connect_url: Optional[str] = None ): """Initialize the base directory on the file system where downloaded @@ -45,13 +47,20 @@ def __init__( Parameters ---------- + package_name: string + Name of the package that created the instance of the local store. + This name is used to associated downloaded datasets in the local + database with the packages that downloaded them. + package_version: string + Version information for the package that created the local store + instance. basedir: string, default=None Path to the directory for downloaded datasets. By default, the directory that is specified in the environment variable REFDATA_BASEDIR - is used or $HOME/.refdata if the environment variable is not set. + is used. If the environment variable is not set an directory under + the OS-specific users cache data directory is used. repo: refdata.repo.RepositoryManager, default=None - Repository manager that is used to access dataset metadata for - downloading datasets. + Associated repository manager. auto_download: bool, default=None If auto download is enabled (True) datasets are downloaded automatically when they are first accessed via `.open()`. If this option is not @@ -65,11 +74,13 @@ def __init__( value is given the default SQLite database is used. If the respective database file does not exist a new database will be created. """ + self.package_name = package_name + self.package_version = package_version # Create the base directory if it does not exist. self.basedir = basedir if basedir else config.BASEDIR() os.makedirs(self.basedir, exist_ok=True) - # Set the repository manager. If none was given the default manager will - # be used when it is first accessed. + # Set the repository manager. If no manager is given it will be + # instantiated when it is first accessed. self.repo = repo # Set the auto download option. Read REFDATA_AUTODOWNLOAD if not no # argument value was given. The default is False. @@ -174,7 +185,10 @@ def download(self, key: str) -> Tuple[str, Dict]: dataset = Dataset( dataset_id=dataset_id, key=key, - descriptor=ds.to_dict() + descriptor=ds.to_dict(), + package_name=self.package_name, + package_version=self.package_version, + filesize=os.stat(dst).st_size ) session.add(dataset) return dataset_id, ds.to_dict() @@ -349,6 +363,49 @@ def repository(self) -> RepositoryManager: return self.repo +class RefStore(LocalStore): + """Default local store for the refdata package. Uses the module name and + package version to set the respective properties of the created local store + instance. + """ + def __init__( + self, basedir: Optional[str] = None, repo: Optional[RepositoryManager] = None, + auto_download: Optional[bool] = None, connect_url: Optional[str] = None + ): + """Initialize the store properties. + + Parameters + ---------- + basedir: string, default=None + Path to the directory for downloaded datasets. By default, the + directory that is specified in the environment variable REFDATA_BASEDIR + is used. If the environment variable is not set an directory under + the OS-specific users cache data directory is used. + repo: refdata.repo.RepositoryManager, default=None + Associated repository manager. 
+ auto_download: bool, default=None + If auto download is enabled (True) datasets are downloaded automatically + when they are first accessed via `.open()`. If this option is not + enabled and an attempt is made to open a datasets that has not yet + been downloaded to the local file syste, an error is raised. If this + argument is not given the value from the environment variable + REFDATA_AUTODOWNLOAD is used or False if the variable is not set. + connect_url: string, default=None + SQLAlchemy database connect Url string. If a value is given it is + assumed that the database exists and has been initialized. If no + value is given the default SQLite database is used. If the respective + database file does not exist a new database will be created. + """ + super(RefStore, self).__init__( + package_name=__name__.split('.')[0], + package_version=__version__, + basedir=basedir, + repo=repo, + auto_download=auto_download, + connect_url=connect_url + ) + + # -- Helper Functions --------------------------------------------------------- def download_file(dataset: DatasetDescriptor, dst: str): diff --git a/tests/store/conftest.py b/tests/store/conftest.py index 5ae6ce4..1324d80 100644 --- a/tests/store/conftest.py +++ b/tests/store/conftest.py @@ -11,11 +11,14 @@ from refdata.repo import RepositoryManager, download_index from refdata.store import LocalStore +from refdata.version import __version__ @pytest.fixture def store(mock_response, tmpdir): return LocalStore( + package_name='refdata_test', + package_version=__version__, basedir=tmpdir, repo=RepositoryManager(doc=download_index('index.json')), auto_download=False diff --git a/tests/store/test_local_store.py b/tests/store/test_local_store.py index 024877e..195b277 100644 --- a/tests/store/test_local_store.py +++ b/tests/store/test_local_store.py @@ -11,8 +11,10 @@ import pytest from refdata.db import Dataset -from refdata.store import LocalStore, download_file +from refdata.store import LocalStore, RefStore, download_file +from refdata.version import __version__ +import refdata import refdata.error as err @@ -59,21 +61,40 @@ def test_local_store_init(tmpdir): """ # First without connection url and no existing database. basedir = os.path.join(tmpdir, 'test') - store = LocalStore(basedir=basedir) + store = LocalStore(package_name='test', package_version='test.1', basedir=basedir) + assert store.package_name == 'test' + assert store.package_version == 'test.1' assert os.path.join(basedir, 'refdata.db') # A seocond call should not re-create the database. To validate this we # create a new dataset and ensure that after re-creating the store that # the database is not empty. with store.db.session() as session: - session.add(Dataset(key='my_key', descriptor={'id': 'my_key'})) - store = LocalStore(basedir=basedir) + session.add( + Dataset( + key='my_key', + descriptor={'id': 'my_key'}, + package_name='test', + package_version='1', + filesize=0 + ) + ) + store = RefStore(basedir=basedir) + assert store.package_name == refdata.__name__.split('.')[0] + assert store.package_version == __version__ with store.db.session() as session: datasets = session.query(Dataset).all() assert len(datasets) == 1 + ds = datasets[0] + assert ds.key == 'my_key' + assert ds.descriptor == {'id': 'my_key'} + assert ds.package_name == 'test' + assert ds.package_version == '1' + assert ds.filesize == 0 + assert ds.created_at is not None # Create the store with a connection string that points to the created # database. 
dbfile = os.path.join(basedir, 'refdata.db') - store = LocalStore(basedir=basedir, connect_url='sqlite:///{}'.format(dbfile)) + store = RefStore(basedir=basedir, connect_url='sqlite:///{}'.format(dbfile)) with store.db.session() as session: datasets = session.query(Dataset).all() assert len(datasets) == 1 @@ -84,7 +105,7 @@ def test_local_store_repo_manager(mock_response, tmpdir): """ # First without connection url and no existing database. basedir = os.path.join(tmpdir, 'test') - store = LocalStore(basedir=basedir) + store = RefStore(basedir=basedir) # Ensure that the default test repository was created. assert len(store.repository().find()) == 3 # Hack to ensure that the manager is created only once. diff --git a/tests/test_database.py b/tests/test_database.py index 450685e..bdf18b8 100644 --- a/tests/test_database.py +++ b/tests/test_database.py @@ -31,13 +31,37 @@ def test_database_session(): db.init() # -- Tests ---------------------------------------------------------------- with db.session() as session: - session.add(Dataset(key='my_key', descriptor={'id': 'my_key'})) + session.add( + Dataset( + key='my_key', + descriptor={'id': 'my_key'}, + package_name='test', + package_version='1', + filesize=0 + ) + ) with db.session() as session: - session.add(Dataset(key='a_key', descriptor={'id': 'a_key'})) + session.add( + Dataset( + key='a_key', + descriptor={'id': 'a_key'}, + package_name='test', + package_version='1', + filesize=0 + ) + ) with db.session() as session: datasets = session.query(Dataset).all() assert len(datasets) == 2 # Error when creating an entry with duplicate key. with pytest.raises(IntegrityError): with db.session() as session: - session.add(Dataset(key='my_key', descriptor={'id': 'my_key'})) + session.add( + Dataset( + key='my_key', + descriptor={'id': 'my_key'}, + package_name='test', + package_version='1', + filesize=0 + ) + ) diff --git a/tests/test_version.py b/tests/test_version.py deleted file mode 100644 index 3cdbf8f..0000000 --- a/tests/test_version.py +++ /dev/null @@ -1,15 +0,0 @@ -# This file is part of the Reference Data Repository (refdata). -# -# Copyright (C) 2021 New York University. -# -# refdata is free software; you can redistribute it and/or modify it under the -# terms of the MIT License; see LICENSE file for more details. 
- -"""Unit tests for version information.""" - -from refdata.version import __version__ - - -def test_package_version(): - """Test accessing package version information (for completion).""" - assert __version__ is not None From 18d1787ab65effc1cf00995bf0654a77c79a68c5 Mon Sep 17 00:00:00 2001 From: Heiko Mueller Date: Wed, 3 Feb 2021 17:46:23 -0500 Subject: [PATCH 02/11] Refactory dataset and dataset loader --- refdata/{loader => dataset}/__init__.py | 4 --- refdata/{store/dataset.py => dataset/base.py} | 9 +++--- refdata/{loader => dataset}/consumer.py | 0 refdata/{loader => dataset}/csv_loader.py | 8 ++--- refdata/{loader => dataset}/json_loader.py | 6 ++-- refdata/{loader/base.py => dataset/loader.py} | 6 ++-- refdata/store/base.py | 2 +- tests/{loader => dataset}/test_consumer.py | 2 +- tests/{loader => dataset}/test_csv_loader.py | 4 +-- tests/dataset/test_dataset_handle.py | 32 +++++++++++++++++++ tests/{loader => dataset}/test_json_loader.py | 6 ++-- ...dataset_handle.py => test_dataset_load.py} | 26 +-------------- 12 files changed, 55 insertions(+), 50 deletions(-) rename refdata/{loader => dataset}/__init__.py (53%) rename refdata/{store/dataset.py => dataset/base.py} (95%) rename refdata/{loader => dataset}/consumer.py (100%) rename refdata/{loader => dataset}/csv_loader.py (94%) rename refdata/{loader => dataset}/json_loader.py (97%) rename refdata/{loader/base.py => dataset/loader.py} (92%) rename tests/{loader => dataset}/test_consumer.py (96%) rename tests/{loader => dataset}/test_csv_loader.py (93%) create mode 100644 tests/dataset/test_dataset_handle.py rename tests/{loader => dataset}/test_json_loader.py (94%) rename tests/store/{test_dataset_handle.py => test_dataset_load.py} (69%) diff --git a/refdata/loader/__init__.py b/refdata/dataset/__init__.py similarity index 53% rename from refdata/loader/__init__.py rename to refdata/dataset/__init__.py index ea27325..35d3926 100644 --- a/refdata/loader/__init__.py +++ b/refdata/dataset/__init__.py @@ -3,7 +3,3 @@ # Copyright (C) 2021 New York University. # # refdata is free software; you can redistribute it and/or modify it under the -# terms of the MIT License; see LICENSE file for more details. - -from refdata.loader.csv_loader import CSVLoader -from refdata.loader.json_loader import JsonLoader diff --git a/refdata/store/dataset.py b/refdata/dataset/base.py similarity index 95% rename from refdata/store/dataset.py rename to refdata/dataset/base.py index 9977008..6c15124 100644 --- a/refdata/store/dataset.py +++ b/refdata/dataset/base.py @@ -11,8 +11,9 @@ import pandas as pd from refdata.base import DatasetDescriptor -from refdata.loader import CSVLoader, JsonLoader -from refdata.loader.consumer import DataConsumer, DataFrameGenerator, DistinctSetGenerator, MappingGenerator +from refdata.dataset.consumer import DataConsumer, DataFrameGenerator, DistinctSetGenerator, MappingGenerator +from refdata.dataset.csv_loader import CSVLoader +from refdata.dataset.json_loader import JsonLoader import refdata.error as err @@ -113,12 +114,12 @@ def load(self, columns: List[str], consumer: DataConsumer) -> DataConsumer: columns: list of string Column identifier defining the content and the schema of the returned data. - consumer: refdata.loader.consumer.DataConsumer + consumer: refdata.dataset.consumer.DataConsumer Consumer for data rows that are being read. Returns ------- - refdata.loader.consumer.DataConsumer + refdata.dataset.consumer.DataConsumer """ # Open the file depending on whether it is compressed or not. 
By now, # we only support gzip compression. diff --git a/refdata/loader/consumer.py b/refdata/dataset/consumer.py similarity index 100% rename from refdata/loader/consumer.py rename to refdata/dataset/consumer.py diff --git a/refdata/loader/csv_loader.py b/refdata/dataset/csv_loader.py similarity index 94% rename from refdata/loader/csv_loader.py rename to refdata/dataset/csv_loader.py index d6772aa..3f93aaa 100644 --- a/refdata/loader/csv_loader.py +++ b/refdata/dataset/csv_loader.py @@ -11,8 +11,8 @@ import csv -from refdata.loader.base import DatasetLoader -from refdata.loader.consumer import DataConsumer +from refdata.dataset.loader import DatasetLoader +from refdata.dataset.consumer import DataConsumer from refdata.base import FormatDescriptor @@ -65,12 +65,12 @@ def read(self, file: IO, columns: List[str], consumer: DataConsumer) -> DataCons Open file object. columns: list of string Identifier of columns that are contained in the output. - consumer: refdata.loader.consumer.DataConsumer + consumer: refdata.dataset.consumer.DataConsumer Consumer for data rows that are being read. Returns ------- - refdata.loader.consumer.DataConsumer + refdata.dataset.consumer.DataConsumer """ reader = csv.reader(file, delimiter=self.delim) # Skip the first row the it contains the dataset header. diff --git a/refdata/loader/json_loader.py b/refdata/dataset/json_loader.py similarity index 97% rename from refdata/loader/json_loader.py rename to refdata/dataset/json_loader.py index 45fc871..34cbd70 100644 --- a/refdata/loader/json_loader.py +++ b/refdata/dataset/json_loader.py @@ -12,8 +12,8 @@ import json from refdata.base import FormatDescriptor -from refdata.loader.consumer import DataConsumer -from refdata.loader.base import DatasetLoader +from refdata.dataset.consumer import DataConsumer +from refdata.dataset.loader import DatasetLoader class JsonLoader(DatasetLoader): @@ -68,7 +68,7 @@ def read(self, file: IO, columns: List[str], consumer: DataConsumer) -> DataCons columns: list of string Column identifier defining the content and the schema of the returned data. - consumer: refdata.loader.consumer.DataConsumer + consumer: refdata.dataset.consumer.DataConsumer Consumer for data rows that are being read. Returns diff --git a/refdata/loader/base.py b/refdata/dataset/loader.py similarity index 92% rename from refdata/loader/base.py rename to refdata/dataset/loader.py index 089561c..02fd4e8 100644 --- a/refdata/loader/base.py +++ b/refdata/dataset/loader.py @@ -13,7 +13,7 @@ from abc import ABCMeta, abstractmethod from typing import IO, List -from refdata.loader.consumer import DataConsumer +from refdata.dataset.consumer import DataConsumer class DatasetLoader(metaclass=ABCMeta): @@ -44,11 +44,11 @@ def read(self, file: IO, columns: List[str], consumer: DataConsumer) -> DataCons columns: list of string Column identifier defining the content and the schema of the returned data. - consumer: refdata.loader.consumer.DataConsumer + consumer: refdata.dataset.consumer.DataConsumer Consumer for data rows that are being read. 
Returns ------- - refdata.loader.consumer.DataConsumer + refdata.dataset.consumer.DataConsumer """ raise NotImplementedError() # pragma: no cover diff --git a/refdata/store/base.py b/refdata/store/base.py index 6fde5f1..7c126dd 100644 --- a/refdata/store/base.py +++ b/refdata/store/base.py @@ -17,7 +17,7 @@ import pandas as pd from refdata.base import DatasetDescriptor -from refdata.store.dataset import DatasetHandle +from refdata.dataset.base import DatasetHandle from refdata.db import Dataset, DATASET_ID, DB, SessionScope from refdata.repo import RepositoryManager from refdata.version import __version__ diff --git a/tests/loader/test_consumer.py b/tests/dataset/test_consumer.py similarity index 96% rename from tests/loader/test_consumer.py rename to tests/dataset/test_consumer.py index 4837603..8b6ca4e 100644 --- a/tests/loader/test_consumer.py +++ b/tests/dataset/test_consumer.py @@ -9,7 +9,7 @@ import pytest -from refdata.loader.consumer import DataFrameGenerator, DistinctSetGenerator, MappingGenerator +from refdata.dataset.consumer import DataFrameGenerator, DistinctSetGenerator, MappingGenerator # List of rows for test purposes. diff --git a/tests/loader/test_csv_loader.py b/tests/dataset/test_csv_loader.py similarity index 93% rename from tests/loader/test_csv_loader.py rename to tests/dataset/test_csv_loader.py index 324b31b..6c95b32 100644 --- a/tests/loader/test_csv_loader.py +++ b/tests/dataset/test_csv_loader.py @@ -12,8 +12,8 @@ import pytest from refdata.base import FormatDescriptor -from refdata.loader.consumer import DataCollector -from refdata.loader.csv_loader import CSVLoader +from refdata.dataset.consumer import DataCollector +from refdata.dataset.csv_loader import CSVLoader @pytest.mark.parametrize( diff --git a/tests/dataset/test_dataset_handle.py b/tests/dataset/test_dataset_handle.py new file mode 100644 index 0000000..f5212bc --- /dev/null +++ b/tests/dataset/test_dataset_handle.py @@ -0,0 +1,32 @@ +# This file is part of the Reference Data Repository (refdata). +# +# Copyright (C) 2021 New York University. +# +# refdata is free software; you can redistribute it and/or modify it under the +# terms of the MIT License; see LICENSE file for more details. + +"""Unit tests for the dataset handle.""" + +import pytest + +from refdata.dataset.base import DatasetHandle + +import refdata.error as err + + +def test_format_error(): + """Ensure that the proper error is raised when initializing a dataset + handle with an invalid format identifier. 
+ """ + doc = { + 'id': '0000', + 'url': 'countries.json', + "checksum": "889c264f2ac4629b4998aa8b8b1d4de45890c39c10e24cfd8a017e9924e805c7", + "schema": [{"id": "name"}, {"id": "alpha2Code"}], + "format": { + "type": "unknown", + "parameters": {} + } + } + with pytest.raises(err.InvalidFormatError): + DatasetHandle(doc=doc, datafile='/dev/null') diff --git a/tests/loader/test_json_loader.py b/tests/dataset/test_json_loader.py similarity index 94% rename from tests/loader/test_json_loader.py rename to tests/dataset/test_json_loader.py index 432e1ae..70786c4 100644 --- a/tests/loader/test_json_loader.py +++ b/tests/dataset/test_json_loader.py @@ -9,9 +9,9 @@ import pytest -from refdata.loader.consumer import DataCollector -from refdata.loader.json_loader import JQuery -from refdata.store.dataset import DatasetHandle +from refdata.dataset.base import DatasetHandle +from refdata.dataset.consumer import DataCollector +from refdata.dataset.json_loader import JQuery # -- Loader ------------------------------------------------------------------- diff --git a/tests/store/test_dataset_handle.py b/tests/store/test_dataset_load.py similarity index 69% rename from tests/store/test_dataset_handle.py rename to tests/store/test_dataset_load.py index ab6e399..86d38b1 100644 --- a/tests/store/test_dataset_handle.py +++ b/tests/store/test_dataset_load.py @@ -5,31 +5,7 @@ # refdata is free software; you can redistribute it and/or modify it under the # terms of the MIT License; see LICENSE file for more details. -"""Unit tests for the dataset handle.""" - -import pytest - -from refdata.store.dataset import DatasetHandle - -import refdata.error as err - - -def test_format_error(): - """Ensure that the proper error is raised when initializing a dataset - handle with an invalid format identifier. 
- """ - doc = { - 'id': '0000', - 'url': 'countries.json', - "checksum": "889c264f2ac4629b4998aa8b8b1d4de45890c39c10e24cfd8a017e9924e805c7", - "schema": [{"id": "name"}, {"id": "alpha2Code"}], - "format": { - "type": "unknown", - "parameters": {} - } - } - with pytest.raises(err.InvalidFormatError): - DatasetHandle(doc=doc, datafile='/dev/null') +"""Unit tests for creating different data objects for downloaded datasets.""" def test_load_distinct(store): From 5eb34edd03dd734258eee6f8bf99315031c6dd73 Mon Sep 17 00:00:00 2001 From: Heiko Mueller Date: Thu, 4 Feb 2021 11:52:18 -0500 Subject: [PATCH 03/11] Refactor repository manager and index loader --- refdata/cli/repo.py | 3 +- refdata/cli/store.py | 10 +- refdata/cli/util.py | 19 +- refdata/dataset/base.py | 4 + refdata/repo/__init__.py | 9 + refdata/repo/loader.py | 233 ++++++++++++++++++ refdata/{repo.py => repo/manager.py} | 74 +----- refdata/repo/schema.py | 53 ++++ refdata/{ => repo}/schema.yaml | 0 refdata/store/base.py | 31 ++- tests/.files/index.yaml | 56 +++++ tests/repo/test_repo_loader.py | 73 ++++++ tests/repo/test_repo_manager.py | 28 +++ .../test_validate_index.py} | 50 ++-- tests/store/conftest.py | 6 +- tests/test_data_query.py | 5 +- 16 files changed, 534 insertions(+), 120 deletions(-) create mode 100644 refdata/repo/__init__.py create mode 100644 refdata/repo/loader.py rename refdata/{repo.py => repo/manager.py} (55%) create mode 100644 refdata/repo/schema.py rename refdata/{ => repo}/schema.yaml (100%) create mode 100644 tests/.files/index.yaml create mode 100644 tests/repo/test_repo_loader.py create mode 100644 tests/repo/test_repo_manager.py rename tests/{test_data_repo.py => repo/test_validate_index.py} (57%) diff --git a/refdata/cli/repo.py b/refdata/cli/repo.py index 1279a33..51faae5 100644 --- a/refdata/cli/repo.py +++ b/refdata/cli/repo.py @@ -9,7 +9,8 @@ import click -from refdata.repo import RepositoryManager, validate +from refdata.repo.manager import RepositoryManager +from refdata.repo.schema import validate import refdata.cli.util as util diff --git a/refdata/cli/store.py b/refdata/cli/store.py index 10c4d76..db41b32 100644 --- a/refdata/cli/store.py +++ b/refdata/cli/store.py @@ -9,7 +9,7 @@ import click -from refdata.repo import RepositoryManager +from refdata.repo.loader import DictLoader from refdata.store.base import RefStore import refdata.cli.util as util @@ -32,7 +32,7 @@ def download_dataset(basedir, db, index, key): """List local store content.""" # Read the index of given. doc = util.read_index(index) if index is not None else None - store = RefStore(basedir=basedir, repo=RepositoryManager(doc=doc), connect_url=db) + store = RefStore(basedir=basedir, loader=DictLoader(doc=doc), connect_url=db) store.download(key) @@ -44,7 +44,7 @@ def list_datasets(basedir, db, index): """List local store content.""" # Read the index of given. doc = util.read_index(index) if index is not None else None - store = RefStore(basedir=basedir, repo=RepositoryManager(doc=doc), connect_url=db) + store = RefStore(basedir=basedir, loader=DictLoader(doc=doc), connect_url=db) util.print_datasets(store.list()) @@ -62,7 +62,7 @@ def remove_dataset(basedir, db, index, force, key): click.confirm(msg, default=True, abort=True) # Read the index of given. 
doc = util.read_index(index) if index is not None else None - store = RefStore(basedir=basedir, repo=RepositoryManager(doc=doc), connect_url=db) + store = RefStore(basedir=basedir, loader=DictLoader(doc=doc), connect_url=db) store.remove(key) @@ -76,5 +76,5 @@ def show_dataset(basedir, db, index, raw, key): """Show descriptor for downloaded dataset.""" # Read the index of given. doc = util.read_index(index) if index is not None else None - store = RefStore(basedir=basedir, repo=RepositoryManager(doc=doc), connect_url=db) + store = RefStore(basedir=basedir, loader=DictLoader(doc=doc), connect_url=db) util.print_dataset(dataset=store.open(key), raw=raw) diff --git a/refdata/cli/util.py b/refdata/cli/util.py index 3064e3e..781d966 100644 --- a/refdata/cli/util.py +++ b/refdata/cli/util.py @@ -13,10 +13,9 @@ import click import json -import os from refdata.base import DatasetDescriptor -from refdata.repo import download_index +from refdata.repo.loader import FileLoader, UrlLoader def print_datasets(datasets: List[DatasetDescriptor]): @@ -88,8 +87,11 @@ def print_dataset(dataset: DatasetDescriptor, raw: bool): def read_index(filename: str) -> Dict: - """Read a repository index file. The filename may either reference a file - on the local file system or is expected to be an Url. + """Read a repository index file. + + The filename may either reference a file on the local file system or is + expected to be an Url. Attempts to read a file first and then load the + Url if an error occured while loading the file. Parameters ---------- @@ -101,8 +103,7 @@ def read_index(filename: str) -> Dict: dict """ try: - with open(filename, 'r') as f: - return json.load(f) - except OSError as ex: - print(ex) - return download_index(url=filename) + return FileLoader(filename).load() + except (IOError, OSError): + pass + return UrlLoader(url=filename).load() diff --git a/refdata/dataset/base.py b/refdata/dataset/base.py index 6c15124..88e7a37 100644 --- a/refdata/dataset/base.py +++ b/refdata/dataset/base.py @@ -5,6 +5,10 @@ # refdata is free software; you can redistribute it and/or modify it under the # terms of the MIT License; see LICENSE file for more details. +"""Base classes for handles that provide access to datasets that have been +downloaded to the local data store. +""" + from typing import Dict, List, Optional, Set, Union import gzip diff --git a/refdata/repo/__init__.py b/refdata/repo/__init__.py new file mode 100644 index 0000000..cbacb87 --- /dev/null +++ b/refdata/repo/__init__.py @@ -0,0 +1,9 @@ +# This file is part of the Reference Data Repository (refdata). +# +# Copyright (C) 2021 New York University. +# +# refdata is free software; you can redistribute it and/or modify it under the +# terms of the MIT License; see LICENSE file for more details. + +from refdata.repo.manager import RepositoryManager # noqa: F401 +from refdata.repo.schema import validate # noqa: F401 diff --git a/refdata/repo/loader.py b/refdata/repo/loader.py new file mode 100644 index 0000000..e4d9eba --- /dev/null +++ b/refdata/repo/loader.py @@ -0,0 +1,233 @@ +# This file is part of the Reference Data Repository (refdata). +# +# Copyright (C) 2021 New York University. +# +# refdata is free software; you can redistribute it and/or modify it under the +# terms of the MIT License; see LICENSE file for more details. + +"""The repository loader serves two purposes. 
First, it provides an interface
+for loading (or creating) repository indexes from different sources (e.g., from
+an Url or a file on the local file system) and (if required) in different
+formats than the official repository index schema. Second, it supports deferred
+loading of the dataset index. Deferred loading is used by the local data store
+to defer loading of the associated repository index until it is first being
+accessed.
+
+This module provides default implementations for loading a repository index
+in the default schema from an Url, a local file, or from a dictionary. It also
+provides implementations for the loader of a federated index file and the
+loader for the default federated index.
+"""
+
+
+from abc import ABCMeta, abstractmethod
+from typing import Dict, Optional
+
+import json
+import requests
+import yaml
+
+import refdata.config as config
+
+
+"""Definition of valid file format identifiers."""
+FORMAT_JSON = 'json'
+FORMAT_YAML = 'yaml'
+FILE_FORMATS = [FORMAT_JSON, FORMAT_YAML]
+
+
+class RepositoryIndexLoader(metaclass=ABCMeta):
+    """Interface for the repository index loader.
+
+    Provides a single method `load` that is expected to return a dictionary
+    containing a list of dataset descriptors that adheres to the `RepositoryIndex`
+    structure defined in the repository schema.
+
+    Different implementations will (a) read the data from different sources,
+    and (b) transform the read data if necessary into the expected format.
+    """
+    @abstractmethod
+    def load(self) -> Dict:
+        """Load a repository index from a data source.
+
+        Returns a dictionary that adheres to the `RepositoryIndex` structure
+        defined in the repository schema.
+
+        Returns
+        -------
+        dict
+        """
+        raise NotImplementedError()  # pragma: no cover
+
+
+# -- Helper Functions ---------------------------------------------------------
+
+class DictLoader(RepositoryIndexLoader):
+    """Repository index loader that is a wrapper around a dictionary containing
+    a list of serialized file descriptors. Loading the index will simply return
+    the wrapped dictionary.
+    """
+    def __init__(self, doc: Dict):
+        """Initialize the dictionary containing the serialized repository index.
+
+        Parameters
+        ----------
+        doc: dict
+            Dictionary containing a single element `datasets` with a list of
+            serialized dataset descriptors.
+        """
+        self.doc = doc
+
+    def load(self) -> Dict:
+        """Loading the repository index will return the dictionary that was
+        given when the object was instantiated.
+
+        Returns
+        -------
+        dict
+        """
+        return self.doc
+
+
+class FileLoader(RepositoryIndexLoader):
+    """Load repository index from a file on the local file system.
+
+    Supports loading files in Json or YAML format. The file format is specified
+    as an optional argument with file types being identified as `json` or
+    `yaml`. If the file format argument is not given an attempt is made to
+    guess the format from the file suffix. The default format is `json`.
+
+    The file loader currently does not follow references to federated
+    repositories that may be listed in the optional `repositories` element of
+    the loaded index file.
+    """
+    def __init__(self, filename: str, ftype: Optional[str] = None):
+        """Initialize the path to the file containing the repository index and
+        the optional file format identifier.
+
+        Raises a ValueError if an invalid file format is given. Valid format
+        identifiers are `json` or `yaml`.
+
+        Parameters
+        ----------
+        filename: string
+            Path to file on the file system.
+        ftype: string, default='json'
+            Identifier for the file format.
+        """
+        self.filename = filename
+        self.ftype = get_file_format(ftype=ftype, filename=filename)
+
+    def load(self) -> Dict:
+        """Read repository index from file.
+
+        Returns
+        -------
+        dict
+        """
+        with open(self.filename, 'r') as f:
+            if self.ftype == FORMAT_YAML:
+                return yaml.load(f.read(), Loader=yaml.FullLoader)
+            else:
+                return json.load(f)
+
+
+class UrlLoader(RepositoryIndexLoader):
+    """Repository index loader that reads data from a given Url.
+
+    Uses the Url that is specified in the environment variable *REFDATA_URL* if
+    no Url is given when the class is initialized.
+
+    Supports loading files in Json or YAML format. The file format is specified
+    as an optional argument with file types being identified as `json` or
+    `yaml`. If the file format argument is not given an attempt is made to
+    guess the format from the file suffix. The default format is `json`.
+
+    The UrlLoader recursively follows links to federated repositories in the
+    optional `repositories` list of a loaded index file.
+    """
+    def __init__(self, url: Optional[str] = None, ftype: Optional[str] = None):
+        """Initialize the Url for the repository index file and the file format.
+
+        Uses the Url that is specified in the environment variable *REFDATA_URL*
+        as default.
+
+        Raises a ValueError if an invalid file format is given. Valid format
+        identifiers are `json` or `yaml`.
+
+        Parameters
+        ----------
+        url: string, default=None
+            Url pointing to the repository index document.
+        ftype: string, default=None
+            Identifier for the file format.
+        """
+        self.url = url if url is not None else config.URL()
+        self.ftype = get_file_format(ftype=ftype, filename=self.url)
+
+    def load(self) -> Dict:
+        """Download the repository index file from the given Url.
+
+        Recursively follows references to other repositories in the optional
+        `repositories` list of the downloaded index file.
+
+        Returns
+        -------
+        dict
+        """
+        # Load the index file. Raises an error if the HTTP request is not
+        # successful.
+        r = requests.get(self.url)
+        r.raise_for_status()
+        # Load response body depending on the specified file format type.
+        if self.ftype == FORMAT_YAML:
+            body = yaml.load(r.content, Loader=yaml.FullLoader)
+        else:
+            body = r.json()
+        # Create the result containing only the dataset descriptors.
+        datasets = body.get('datasets', list())
+        # Recursively read additional federated repositories that are specified
+        # in the 'repositories' list and add their datasets to the returned
+        # result.
+        for url in body.get('repositories', list()):
+            for obj in UrlLoader(url=url).load().get('datasets'):
+                datasets.append(obj)
+        return {'datasets': datasets}
+
+
+# -- Helper Functions ---------------------------------------------------------
+
+def get_file_format(ftype: str, filename: str) -> str:
+    """Get the file format identifier.
+
+    If the `ftype` is given it is verified that a valid file format identifier
+    is specified. Valid format identifiers are `json` and `yaml`. If the identifier
+    is valid it is returned. Otherwise, a ValueError is raised.
+
+    If the format identifier is not given, an attempt is made to *guess* the
+    format from the suffix of the given file name or Url. Files ending with
+    `.json` are assumed to be in Json format and files ending in `.yml` or
+    `.yaml` are assumed to be in YAML format. Files that do not match either
+    of these suffixes are assumed to be in Json format.
+
+    Parameters
+    ----------
+    ftype: string
+        Identifier for the file format.
The giben value may be None. + filename: string + Path to file on the file system or Url. + + Returns + ------- + string + """ + if ftype is not None: + if ftype not in FILE_FORMATS: + raise ValueError("unknown file format '{}'".format(ftype)) + return ftype + if '.' in filename: + # Get the file suffix. + suffix = filename.lower().split('.')[-1] + if suffix in ['yml', 'yaml']: + return FORMAT_YAML + return FORMAT_JSON diff --git a/refdata/repo.py b/refdata/repo/manager.py similarity index 55% rename from refdata/repo.py rename to refdata/repo/manager.py index ab7cda0..f24221d 100644 --- a/refdata/repo.py +++ b/refdata/repo/manager.py @@ -9,18 +9,10 @@ available for download in the Reference Data Repository. """ -from jsonschema import Draft7Validator, RefResolver from typing import Dict, List, Optional, Set, Union -import importlib.resources as pkg_resources -import os -import requests -import yaml - from refdata.base import DatasetDescriptor -import refdata.config as config - class RepositoryManager: """The repository manager provides the functionality for querying a @@ -28,10 +20,8 @@ class RepositoryManager: By default, the index that the environment variable REFDATA_URL or its default value points to is read. """ - def __init__(self, doc: Optional[Dict] = None): - """Initialize the index of dataset descriptors. If no data is provided - it is read from the value that the environment variable REFDATA_URL - points to. + def __init__(self, doc: Dict): + """Initialize the index of dataset descriptors. Parameters ---------- @@ -39,19 +29,11 @@ def __init__(self, doc: Optional[Dict] = None): Dictionary containing the dataset index. This dictionary is expected to follow the `RepositoryIndex` schema. """ - # Read the default index if no data was given. - doc = doc if doc is not None else download_index(url=config.URL()) # Create dataset index for entries in the read document. self.datasets = dict() for obj in doc.get('datasets', list()): ds = DatasetDescriptor(obj) self.datasets[ds.identifier] = ds - # Read additional repositories that may be specified in the main - # document. - for url in doc.get('repositories', list()): - for obj in download_index(url=url).get('datasets', list()): - ds = DatasetDescriptor(obj) - self.datasets[ds.identifier] = ds def find(self, filter: Optional[Union[str, List[str], Set[str]]] = None) -> List[DatasetDescriptor]: """Query the dataset index. The filter is a single tag or a list of @@ -98,55 +80,3 @@ def get(self, key: str) -> DatasetDescriptor: refdata.base.DatasetDescriptor """ return self.datasets.get(key) - - -# -- Helper Functions --------------------------------------------------------- - -def download_index(url: str) -> Dict: - """Download the repository index file from the given Url. - - Parameters - ---------- - url: string - Url pointing to the repository index document. - - Returns - ------- - dict - """ - r = requests.get(url) - r.raise_for_status() - return r.json() - - -"""Create schema validator for the repository index file.""" -# Make sure that the path to the schema file is a valid URI. Otherwise, errors -# occur (at least on MS Windows environments). 
Changed based on: -# https://github.com/Julian/jsonschema/issues/398#issuecomment-385130094 -schemafile = 'file:///{}'.format(os.path.abspath(os.path.join(__file__, 'schema.yaml'))) -schema = yaml.safe_load(pkg_resources.open_text(__package__, 'schema.yaml')) -resolver = RefResolver(schemafile, schema) - - -def validate(doc: Dict) -> Draft7Validator: - """Validate the schema for a repository index document. - - The given document is a dictionary containing the repository index. An - error is raised if the referenced document does not satisfy the defined - repository index schema. - - - Parameters - ---------- - doc: dict - Repository index document. - - Raises - ------ - jsonschema.exceptions.ValidationError - """ - validator = Draft7Validator( - schema=schema['definitions']['RepositoryIndex'], - resolver=resolver - ) - validator.validate(doc) diff --git a/refdata/repo/schema.py b/refdata/repo/schema.py new file mode 100644 index 0000000..c8013a1 --- /dev/null +++ b/refdata/repo/schema.py @@ -0,0 +1,53 @@ +# This file is part of the Reference Data Repository (refdata). +# +# Copyright (C) 2021 New York University. +# +# refdata is free software; you can redistribute it and/or modify it under the +# terms of the MIT License; see LICENSE file for more details. + +"""Schema validator for the repository index file. + +The validator is inteded as a tool for developers and data publishers to +validate their data indexes before publishing them. The validator is currently +not used to validate the schema of an index structure that is passed to a +repository manager. +""" + +from jsonschema import Draft7Validator, RefResolver +from typing import Dict + +import importlib.resources as pkg_resources +import os +import yaml + + +# Make sure that the path to the schema file is a valid URI. Otherwise, errors +# occur (at least on MS Windows environments). Changed based on: +# https://github.com/Julian/jsonschema/issues/398#issuecomment-385130094 +schemafile = 'file:///{}'.format(os.path.abspath(os.path.join(__file__, 'schema.yaml'))) +schema = yaml.safe_load(pkg_resources.open_text(__package__, 'schema.yaml')) +resolver = RefResolver(schemafile, schema) + + +def validate(doc: Dict) -> Draft7Validator: + """Validate the schema for a repository index document. + + The given document is a dictionary containing the repository index. An + error is raised if the referenced document does not satisfy the defined + repository index schema. + + + Parameters + ---------- + doc: dict + Repository index document. 
+ + Raises + ------ + jsonschema.exceptions.ValidationError + """ + validator = Draft7Validator( + schema=schema['definitions']['RepositoryIndex'], + resolver=resolver + ) + validator.validate(doc) diff --git a/refdata/schema.yaml b/refdata/repo/schema.yaml similarity index 100% rename from refdata/schema.yaml rename to refdata/repo/schema.yaml diff --git a/refdata/store/base.py b/refdata/store/base.py index 7c126dd..9296241 100644 --- a/refdata/store/base.py +++ b/refdata/store/base.py @@ -19,7 +19,8 @@ from refdata.base import DatasetDescriptor from refdata.dataset.base import DatasetHandle from refdata.db import Dataset, DATASET_ID, DB, SessionScope -from refdata.repo import RepositoryManager +from refdata.repo.loader import RepositoryIndexLoader, UrlLoader +from refdata.repo.manager import RepositoryManager from refdata.version import __version__ import refdata.config as config @@ -37,7 +38,7 @@ class LocalStore: """ def __init__( self, package_name: str, package_version: str, - basedir: Optional[str] = None, repo: Optional[RepositoryManager] = None, + basedir: Optional[str] = None, loader: Optional[RepositoryIndexLoader] = None, auto_download: Optional[bool] = None, connect_url: Optional[str] = None ): """Initialize the base directory on the file system where downloaded @@ -59,8 +60,10 @@ def __init__( directory that is specified in the environment variable REFDATA_BASEDIR is used. If the environment variable is not set an directory under the OS-specific users cache data directory is used. - repo: refdata.repo.RepositoryManager, default=None - Associated repository manager. + loader: refdata.repo.loader.RepositoryIndexLoader, default=None + Loader for a dataset repository index. the loaded index is used to + create an instance of the repository manager that is associated with + the local data store for downloading datasets. auto_download: bool, default=None If auto download is enabled (True) datasets are downloaded automatically when they are first accessed via `.open()`. If this option is not @@ -79,9 +82,11 @@ def __init__( # Create the base directory if it does not exist. self.basedir = basedir if basedir else config.BASEDIR() os.makedirs(self.basedir, exist_ok=True) - # Set the repository manager. If no manager is given it will be - # instantiated when it is first accessed. - self.repo = repo + # Set the repository loader. The repository manager will be instantiated + # when it is first accessed. If no loader is given the default dataset + # index will be loaded for the associated repository manager instance. + self.loader = loader if loader is not None else UrlLoader() + self.repo = None # Set the auto download option. Read REFDATA_AUTODOWNLOAD if not no # argument value was given. The default is False. self.auto_download = auto_download if auto_download is not None else config.AUTO_DOWNLOAD() @@ -359,7 +364,7 @@ def repository(self) -> RepositoryManager: # given when the store was created and this is the firat access to # the manager. if self.repo is None: - self.repo = RepositoryManager() + self.repo = RepositoryManager(doc=self.loader.load()) return self.repo @@ -369,7 +374,7 @@ class RefStore(LocalStore): instance. """ def __init__( - self, basedir: Optional[str] = None, repo: Optional[RepositoryManager] = None, + self, basedir: Optional[str] = None, loader: Optional[RepositoryIndexLoader] = None, auto_download: Optional[bool] = None, connect_url: Optional[str] = None ): """Initialize the store properties. 
@@ -381,8 +386,10 @@ def __init__( directory that is specified in the environment variable REFDATA_BASEDIR is used. If the environment variable is not set an directory under the OS-specific users cache data directory is used. - repo: refdata.repo.RepositoryManager, default=None - Associated repository manager. + loader: refdata.repo.loader.RepositoryIndexLoader, default=None + Loader for a dataset repository index. the loaded index is used to + create an instance of the repository manager that is associated with + the local data store for downloading datasets. auto_download: bool, default=None If auto download is enabled (True) datasets are downloaded automatically when they are first accessed via `.open()`. If this option is not @@ -400,7 +407,7 @@ def __init__( package_name=__name__.split('.')[0], package_version=__version__, basedir=basedir, - repo=repo, + loader=loader, auto_download=auto_download, connect_url=connect_url ) diff --git a/tests/.files/index.yaml b/tests/.files/index.yaml new file mode 100644 index 0000000..93f1150 --- /dev/null +++ b/tests/.files/index.yaml @@ -0,0 +1,56 @@ +datasets: +- checksum: 8d4c77b84cbe8c6683bbfa9f58c8268455f820b98289b51955dcef87b1d48d60 + compression: gzip + description: Names of cities in the U.S. from the Encyclopaedia Britannica. + format: + parameters: + delim: "\t" + type: csv + id: cities + name: Cities in the U.S. + schema: + - description: City Name + dtype: text + id: city + name: City + - description: U.S. State Name + dtype: text + id: state + name: State + url: http://cities.tsv.gz + webpage: https://www.britannica.com/topic/list-of-cities-and-towns-in-the-United-States-2023068 +- checksum: 889c264f2ac4629b4998aa8b8b1d4de45890c39c10e24cfd8a017e9924e805c7 + description: Information about countries in the world available from the restcountries.eu + project. + format: + parameters: {} + type: json + id: countries + name: REST Countries + schema: + - description: Country name + dtype: text + id: name + name: Name + - description: ISO 3166-1 2-letter country code + dtype: text + id: alpha2Code + name: Country Code (2-letters) + - description: ISO 3166-1 3-letter country code + dtype: text + id: alpha3Code + name: Country Code (3-letters) + - description: Capital city + dtype: text + id: capital + name: Capital + - description: World region + dtype: text + id: region + name: Region + - description: Sub-region within the country region + dtype: text + id: subregion + name: Sub-Region + url: http://countries.json + webpage: https://restcountries.eu/ diff --git a/tests/repo/test_repo_loader.py b/tests/repo/test_repo_loader.py new file mode 100644 index 0000000..20ea090 --- /dev/null +++ b/tests/repo/test_repo_loader.py @@ -0,0 +1,73 @@ +# This file is part of the Reference Data Repository (refdata). +# +# Copyright (C) 2021 New York University. +# +# refdata is free software; you can redistribute it and/or modify it under the +# terms of the MIT License; see LICENSE file for more details. 
+ +"""Unit tests for the repository index loader.""" + +import os +import pytest + +from refdata.repo.loader import DictLoader, FileLoader, UrlLoader +from refdata.repo.loader import FORMAT_JSON, FORMAT_YAML, get_file_format + + +"""Path to index files in the local test file directory.""" +DIR = os.path.dirname(os.path.realpath(__file__)) +DATA_DIR = os.path.join(DIR, '../.files') +JSON_FILE = os.path.join(DATA_DIR, 'index.json') +YAML_FILE = os.path.join(DATA_DIR, 'index.yaml') + +"""Mocked Urls.""" +JSON_URL = 'http://index.json' +YAML_URL = 'http://index.yaml' + + +def test_dictionary_loader(): + """Test the dictionary index loader.""" + doc = {'datasets': []} + assert DictLoader(doc=doc).load() == doc + + +@pytest.mark.parametrize( + 'filename,ftype', + [(YAML_FILE, None), (YAML_FILE, FORMAT_YAML), (JSON_FILE, None), (JSON_FILE, FORMAT_JSON)] +) +def test_file_loader(filename, ftype): + """Test loading the repository index from files in different formats.""" + doc = FileLoader(filename=filename, ftype=ftype).load() + assert len(doc['datasets']) == 2 + + +@pytest.mark.parametrize( + 'ftype,filename,result', + [ + (FORMAT_JSON, 'index.yaml', FORMAT_JSON), + (FORMAT_YAML, 'index.json', FORMAT_YAML), + (None, 'index.json', FORMAT_JSON), + (None, 'index.yml', FORMAT_YAML), + (None, 'index.json.yaml', FORMAT_YAML), + (None, 'index_json_yaml', FORMAT_JSON) + ] +) +def test_guess_file_format(ftype, filename, result): + """Test various cases for guessing the index file format.""" + assert get_file_format(ftype=ftype, filename=filename) == result + + +def test_invalid_file_format(): + """Test error case where an invalid file format identifier is given.""" + with pytest.raises(ValueError): + get_file_format(ftype='unknown', filename='index.json') + + +@pytest.mark.parametrize( + 'url,ftype', + [(YAML_URL, None), (YAML_URL, FORMAT_YAML), (JSON_URL, None), (JSON_URL, FORMAT_JSON)] +) +def test_url_loader(url, ftype, mock_response): + """Test loading the repository index from a Url in different formats.""" + doc = UrlLoader(url=url, ftype=ftype).load() + assert len(doc['datasets']) == 2 diff --git a/tests/repo/test_repo_manager.py b/tests/repo/test_repo_manager.py new file mode 100644 index 0000000..7636fbc --- /dev/null +++ b/tests/repo/test_repo_manager.py @@ -0,0 +1,28 @@ +# This file is part of the Reference Data Repository (refdata). +# +# Copyright (C) 2021 New York University. +# +# refdata is free software; you can redistribute it and/or modify it under the +# terms of the MIT License; see LICENSE file for more details. + +"""Unit tests for the dataset repository.""" + +from refdata.repo.loader import UrlLoader +from refdata.repo.manager import RepositoryManager + + +def test_get_dataset(mock_response): + """Test getting a dataset from the default repository.""" + # Will attempt to download the default repository. The mocked response will + # return the content of the `index.json` file in the test files directory. 
+ repo = RepositoryManager(doc=UrlLoader().load()) + assert repo.get(key='DS1').identifier == 'DS1' + assert repo.get(key='UNKNOWN') is None + + +def test_read_linked_index(mock_response): + """Test reading a federated repository index.""" + repo = RepositoryManager(doc=UrlLoader(url='multi-index.json').load()) + assert len(repo.find()) == 3 + assert repo.get('us_cities') is not None + assert repo.get('cities') is not None diff --git a/tests/test_data_repo.py b/tests/repo/test_validate_index.py similarity index 57% rename from tests/test_data_repo.py rename to tests/repo/test_validate_index.py index 93fd911..e0e053a 100644 --- a/tests/test_data_repo.py +++ b/tests/repo/test_validate_index.py @@ -5,20 +5,13 @@ # refdata is free software; you can redistribute it and/or modify it under the # terms of the MIT License; see LICENSE file for more details. -"""Unit tests for the dataset repository.""" +"""Unit tests for the repository index schema validator.""" from jsonschema.exceptions import ValidationError import pytest -from refdata.repo import RepositoryManager, download_index, validate - - -def test_get_dataset(mock_response): - """Test getting a dataset from the test repository.""" - repo = RepositoryManager() - assert repo.get(key='DS1').identifier == 'DS1' - assert repo.get(key='UNKNOWN') is None +from refdata.repo.schema import validate @pytest.mark.parametrize( @@ -55,15 +48,40 @@ def test_get_dataset(mock_response): } ] ) -def test_invalid_repository_index(doc, mock_response): +def test_invalid_repository_index(doc): """Test error for invalid repository index documents.""" with pytest.raises(ValidationError): validate(doc) -def test_read_linked_index(mock_response): - """Test validating a 'downloaded' repository index document.""" - repo = RepositoryManager(doc=download_index(url='multi-index.json')) - assert len(repo.find()) == 3 - assert repo.get('us_cities') is not None - assert repo.get('cities') is not None +@pytest.mark.parametrize( + 'doc', + [ + { + 'datasets': [ + { + 'id': '0000', + 'url': 'xyz.com', + 'checksum': '0', + 'schema': [{'id': 'C1'}], + 'format': {'type': 'csv', 'parameters': {}} + } + ] + }, + { + 'datasets': [ + { + 'id': '0000', + 'url': 'xyz.com', + 'checksum': '0', + 'schema': [{'id': 'C1'}], + 'format': {'type': 'csv', 'parameters': {}} + } + ], + 'repositories': ['abc.org'] + } + ] +) +def test_valid_repository_index(doc): + """Test correct validation for valid repository index documents.""" + validate(doc) diff --git a/tests/store/conftest.py b/tests/store/conftest.py index 1324d80..e11c95e 100644 --- a/tests/store/conftest.py +++ b/tests/store/conftest.py @@ -9,8 +9,8 @@ import pytest -from refdata.repo import RepositoryManager, download_index -from refdata.store import LocalStore +from refdata.repo.loader import UrlLoader +from refdata.store.base import LocalStore from refdata.version import __version__ @@ -20,6 +20,6 @@ def store(mock_response, tmpdir): package_name='refdata_test', package_version=__version__, basedir=tmpdir, - repo=RepositoryManager(doc=download_index('index.json')), + loader=UrlLoader(url='index.json'), auto_download=False ) diff --git a/tests/test_data_query.py b/tests/test_data_query.py index 577573b..c37a73e 100644 --- a/tests/test_data_query.py +++ b/tests/test_data_query.py @@ -10,7 +10,8 @@ import pytest from refdata.base import DatasetDescriptor -from refdata.repo import RepositoryManager +from refdata.repo.loader import UrlLoader +from refdata.repo.manager import RepositoryManager """Dataset for single dataset match tests.""" 
@@ -66,6 +67,6 @@ def test_dataset_query(query, result): ) def test_repository_query(query, result, mock_response): """Test querying the text repository.""" - repo = RepositoryManager() + repo = RepositoryManager(doc=UrlLoader().load()) datasets = [ds.identifier for ds in repo.find(filter=query)] assert datasets == result From f5b093eb843aef4d5da6a30b9b826ebcd4f5fd13 Mon Sep 17 00:00:00 2001 From: Heiko Mueller Date: Thu, 4 Feb 2021 11:54:53 -0500 Subject: [PATCH 04/11] Change created_at timestamp for downloaded files to UTC --- refdata/db.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/refdata/db.py b/refdata/db.py index 4c0fd19..26acc8b 100644 --- a/refdata/db.py +++ b/refdata/db.py @@ -9,10 +9,11 @@ metadata management. """ -import datetime import json import uuid +from datetime import datetime +from dateutil.tz import UTC from sqlalchemy import Column, Integer, String, create_engine from sqlalchemy.ext.declarative import declarative_base from sqlalchemy.orm import sessionmaker @@ -62,7 +63,7 @@ def local_time() -> str: ------- string """ - return datetime.datetime.now().isoformat() + return datetime.now(UTC).isoformat() class Dataset(Base): @@ -83,7 +84,7 @@ class Dataset(Base): descriptor = Column(JsonObject, nullable=False) package_name = Column(String(256), nullable=False) package_version = Column(String(256), nullable=False) - created_at = Column(String(26), default=local_time, nullable=False) + created_at = Column(String(32), default=local_time, nullable=False) filesize = Column(Integer, nullable=False) From c6fa17773f2ecfe90e6919f5b58a12e85562a914 Mon Sep 17 00:00:00 2001 From: Heiko Mueller Date: Thu, 4 Feb 2021 13:45:52 -0500 Subject: [PATCH 05/11] Include additional dataset metadata in CLI store listing --- refdata/cli/repo.py | 30 +++++++++--- refdata/cli/store.py | 41 ++++++++++++---- refdata/cli/util.py | 32 +++--------- refdata/dataset/base.py | 32 ++++++++++-- refdata/db.py | 3 +- refdata/store/base.py | 73 ++++++++++++++++------------ requirements.txt | 3 ++ setup.py | 5 +- tests/dataset/test_dataset_handle.py | 9 +++- tests/dataset/test_json_loader.py | 9 +++- tests/store/test_local_store.py | 18 +++---- tests/test_database.py | 9 ++-- 12 files changed, 164 insertions(+), 100 deletions(-) diff --git a/refdata/cli/repo.py b/refdata/cli/repo.py index 51faae5..3e95c77 100644 --- a/refdata/cli/repo.py +++ b/refdata/cli/repo.py @@ -8,7 +8,9 @@ """Commands that interact with a repository index.""" import click +import tableprint as tp +from refdata.repo.loader import DictLoader, UrlLoader from refdata.repo.manager import RepositoryManager from refdata.repo.schema import validate @@ -27,9 +29,24 @@ def cli_repo(): @click.option('-i', '--index', required=False, help='Repository index file') def list_repository(index): """List repository index content.""" - # Read the index of given. - doc = util.read_index(index) if index is not None else None - util.print_datasets(RepositoryManager(doc=doc).find()) + # Read the index from the optional file or Url. By default, the index that + # is specified in the environment is loaded. + loader = DictLoader(util.read_index(index)) if index is not None else UrlLoader() + datasets = RepositoryManager(doc=loader.load()).find() + headers = ['Identifier', 'Name', 'Description'] + data = list() + # Maintain the maximum with for each columns. + widths = [len(h) + 1 for h in headers] + # Sort datasets by name before output. 
+ for dataset in sorted(datasets, key=lambda d: d.name): + desc = dataset.description if dataset.description is not None else '' + row = [dataset.identifier, dataset.name, desc] + for i in range(len(row)): + w = len(row[i]) + 1 + if w > widths[i]: + widths[i] = w + data.append(row) + tp.table(data, headers=headers, width=widths, style='grid', out=util.TPrinter()) @cli_repo.command(name='show') @@ -38,9 +55,10 @@ def list_repository(index): @click.argument('key') def show_dataset(index, raw, key): """Show dataset descriptor from repository index.""" - # Read the index of given. - doc = util.read_index(index) if index is not None else None - util.print_dataset(dataset=RepositoryManager(doc=doc).get(key), raw=raw) + # Read the index from the optional file or Url. By default, the index that + # is specified in the environment is loaded. + loader = DictLoader(util.read_index(index)) if index is not None else UrlLoader() + util.print_dataset(dataset=RepositoryManager(doc=loader.load()).get(key), raw=raw) @cli_repo.command(name='validate') diff --git a/refdata/cli/store.py b/refdata/cli/store.py index db41b32..ec3a27c 100644 --- a/refdata/cli/store.py +++ b/refdata/cli/store.py @@ -7,9 +7,12 @@ """Commands that interact with the local data store.""" +from datasize import DataSize + import click +import tableprint as tp -from refdata.repo.loader import DictLoader +from refdata.repo.loader import DictLoader, UrlLoader from refdata.store.base import RefStore import refdata.cli.util as util @@ -31,8 +34,8 @@ def cli_store(): def download_dataset(basedir, db, index, key): """List local store content.""" # Read the index of given. - doc = util.read_index(index) if index is not None else None - store = RefStore(basedir=basedir, loader=DictLoader(doc=doc), connect_url=db) + loader = DictLoader(util.read_index(index)) if index is not None else UrlLoader() + store = RefStore(basedir=basedir, loader=loader, connect_url=db) store.download(key) @@ -43,9 +46,27 @@ def download_dataset(basedir, db, index, key): def list_datasets(basedir, db, index): """List local store content.""" # Read the index of given. - doc = util.read_index(index) if index is not None else None - store = RefStore(basedir=basedir, loader=DictLoader(doc=doc), connect_url=db) - util.print_datasets(store.list()) + loader = DictLoader(util.read_index(index)) if index is not None else UrlLoader() + store = RefStore(basedir=basedir, loader=loader, connect_url=db) + datasets = store.list() + headers = ['Name', 'Size', 'Downloaded', 'Package'] + data = list() + # Maintain the maximum with for each columns. + widths = [len(h) + 1 for h in headers] + # Sort datasets by name before output. + for dataset in sorted(datasets, key=lambda d: d.name): + row = [ + dataset.identifier, + '{:.2a}'.format(DataSize(dataset.filesize)), + ' '.join(dataset.created_at.isoformat()[:19].split('T')), + '{} {}'.format(dataset.package_name, dataset.package_version) + ] + for i in range(len(row)): + w = len(row[i]) + 1 + if w > widths[i]: + widths[i] = w + data.append(row) + tp.table(data, headers=headers, width=widths, style='grid', out=util.TPrinter()) @cli_store.command(name='remove') @@ -61,8 +82,8 @@ def remove_dataset(basedir, db, index, force, key): msg = "Do you really want to remove dataset '{}'".format(key) click.confirm(msg, default=True, abort=True) # Read the index of given. 
- doc = util.read_index(index) if index is not None else None - store = RefStore(basedir=basedir, loader=DictLoader(doc=doc), connect_url=db) + loader = DictLoader(util.read_index(index)) if index is not None else UrlLoader() + store = RefStore(basedir=basedir, loader=loader, connect_url=db) store.remove(key) @@ -75,6 +96,6 @@ def remove_dataset(basedir, db, index, force, key): def show_dataset(basedir, db, index, raw, key): """Show descriptor for downloaded dataset.""" # Read the index of given. - doc = util.read_index(index) if index is not None else None - store = RefStore(basedir=basedir, loader=DictLoader(doc=doc), connect_url=db) + loader = DictLoader(util.read_index(index)) if index is not None else UrlLoader() + store = RefStore(basedir=basedir, loader=loader, connect_url=db) util.print_dataset(dataset=store.open(key), raw=raw) diff --git a/refdata/cli/util.py b/refdata/cli/util.py index 781d966..f1e7df0 100644 --- a/refdata/cli/util.py +++ b/refdata/cli/util.py @@ -9,7 +9,7 @@ line interface. """ -from typing import Dict, List +from typing import Dict import click import json @@ -18,31 +18,13 @@ from refdata.repo.loader import FileLoader, UrlLoader -def print_datasets(datasets: List[DatasetDescriptor]): - """Print a listing of datasets to the console. +class TPrinter: + """Wrapper around `click.echo` for table printing.""" + def write(self, s): + click.echo(s) - Outputs the identifier, name and description for each dataset in the given - list. Datasets are sorted by their name. - - Parameters - ---------- - datasets: list of refdata.base.DatasetDescriptor - List of dataset descriptors. - """ - # Compute maximal length of values for the dataset identifier, name and - # description. The length values are used to align the output. - id_len = max([len(d.identifier) for d in datasets] + [10]) - name_len = max([len(d.name) for d in datasets] + [4]) - desc_len = max([len(d.description) for d in datasets if d.description is not None] + [11]) - # Create the output template with all values left aligned. - template = '{:<' + str(id_len) + '} | {:<' + str(name_len) + '} | {:<' + str(desc_len) + '}' - click.echo() - click.echo(template.format('Identifier', 'Name', 'Description')) - click.echo(template.format('-' * id_len, '-' * name_len, '-' * desc_len)) - # Sort datasets by name before output. - for dataset in sorted(datasets, key=lambda d: d.name): - desc = dataset.description if dataset.description is not None else '' - click.echo(template.format(dataset.identifier, dataset.name, desc)) + def flush(self): + pass def print_dataset(dataset: DatasetDescriptor, raw: bool): diff --git a/refdata/dataset/base.py b/refdata/dataset/base.py index 88e7a37..ea3b683 100644 --- a/refdata/dataset/base.py +++ b/refdata/dataset/base.py @@ -9,9 +9,12 @@ downloaded to the local data store. """ +from dateutil.parser import isoparse from typing import Dict, List, Optional, Set, Union +import datetime import gzip +import os import pandas as pd from refdata.base import DatasetDescriptor @@ -26,19 +29,34 @@ class DatasetHandle(DatasetDescriptor): """Handle for a dataset in the local data store. Provides the functionality to read data in different formats from the downloaded data file. """ - def __init__(self, doc: Dict, datafile: str): + def __init__( + self, descriptor: Dict, package_name: str, package_version: str, + created_at: datetime.datetime, datafile: str + ): """Initialize the descriptor information and the path to the downloaded - data file. 
This will also create an instance of the dataset loader that - is dependent on the dataset format. + data file. + + This will also create an instance of the dataset loader that is used + for reading the data file dependent on the dataset format. Parameters ---------- - doc: dict + descriptor: dict Dictionary serialization for the dataset descriptor. + package_name: string + Name of the package that downloaded the dataset. + package_version: string + Version information for the package that downloaded the dataset. + created_at: str + Timestamp (in UTC) when the dataset was downloaded. datafile: string Path to the downloaded file. """ - super(DatasetHandle, self).__init__(doc=doc) + super(DatasetHandle, self).__init__(doc=descriptor) + self.package_name = package_name + self.package_version = package_version + # Convert the timestamp from UTC to local time. + self.created_at = isoparse(created_at).astimezone() self.datafile = datafile # Create the format-dependent instance of the dataset loader. parameters = self.format @@ -103,6 +121,10 @@ def distinct(self, columns: Optional[Union[str, List[str]]] = None) -> Set: columns = columns if isinstance(columns, list) else [columns] return self.load(columns=columns, consumer=DistinctSetGenerator()).to_set() + @property + def filesize(self) -> int: + return os.stat(self.datafile).st_size + def load(self, columns: List[str], consumer: DataConsumer) -> DataConsumer: """Load data for the specified columns from the downloaded dataset file. diff --git a/refdata/db.py b/refdata/db.py index 26acc8b..9ec8c79 100644 --- a/refdata/db.py +++ b/refdata/db.py @@ -14,7 +14,7 @@ from datetime import datetime from dateutil.tz import UTC -from sqlalchemy import Column, Integer, String, create_engine +from sqlalchemy import Column, String, create_engine from sqlalchemy.ext.declarative import declarative_base from sqlalchemy.orm import sessionmaker from sqlalchemy.types import TypeDecorator, Unicode @@ -85,7 +85,6 @@ class Dataset(Base): package_name = Column(String(256), nullable=False) package_version = Column(String(256), nullable=False) created_at = Column(String(32), default=local_time, nullable=False) - filesize = Column(Integer, nullable=False) # -- Database Object ---------------------------------------------------------- diff --git a/refdata/store/base.py b/refdata/store/base.py index 9296241..7ce0b15 100644 --- a/refdata/store/base.py +++ b/refdata/store/base.py @@ -11,7 +11,7 @@ from pooch.core import stream_download from pooch.downloaders import choose_downloader -from typing import Dict, List, Optional, Set, Tuple, Union +from typing import Dict, List, Optional, Set, Union import os import pandas as pd @@ -141,11 +141,12 @@ def distinct( dataset = self.open(key=key, auto_download=auto_download) return dataset.distinct(columns=columns) - def download(self, key: str) -> Tuple[str, Dict]: - """Download the dataset with the given (external) identifier. If no - dataset with that given key exists an error is raised. If the - dataset had been downloaded before the existing data file is - downloaded again. + def download(self, key: str) -> DatasetHandle: + """Download the dataset with the given (external) identifier. + + Returns the handle for the downloaded dataset. If no dataset with that + given key exists an error is raised. If the dataset had been downloaded + before the data file is downloaded again. Returns the internal identifier and the descriptor (serialization) for the downloaded dataset. 
@@ -157,15 +158,15 @@ def download(self, key: str) -> Tuple[str, Dict]: Returns ------- - string, dict + refdata.dataset.base.DatasetHandle Raises ------ refdata.error.UnknownDatasetError """ # Get the dataset descriptor from the repository. - ds = self.repository().get(key=key) - if ds is None: + descriptor = self.repository().get(key=key) + if descriptor is None: raise err.UnknownDatasetError(key=key) # Get the internal dataset identifier if the dataset had been # downloaded before. If the dataset had not been downloaded an new @@ -182,7 +183,7 @@ def download(self, key: str) -> Tuple[str, Dict]: # will raise an error if the checksum for the downloaded file does not # match the expected checksum from the repository index. dst = self._datafile(dataset_id) - download_file(dataset=ds, dst=dst) + download_file(dataset=descriptor, dst=dst) # Create entry for the downloaded dataset if it was downloaded for # the first time. if not ds_exists: @@ -190,13 +191,12 @@ def download(self, key: str) -> Tuple[str, Dict]: dataset = Dataset( dataset_id=dataset_id, key=key, - descriptor=ds.to_dict(), + descriptor=descriptor.to_dict(), package_name=self.package_name, - package_version=self.package_version, - filesize=os.stat(dst).st_size + package_version=self.package_version ) session.add(dataset) - return dataset_id, ds.to_dict() + return self.open(key=key) def _get(self, session: SessionScope, key: str) -> Dataset: """Get the database object for the dataset with the given key. If @@ -213,17 +213,25 @@ def _get(self, session: SessionScope, key: str) -> Dataset: """ return session.query(Dataset).filter(Dataset.key == key).one_or_none() - def list(self) -> List[DatasetDescriptor]: + def list(self) -> List[DatasetHandle]: """Get the descriptors for all datasets that have been downloaded and are available from the local dataset store. Returns ------- - list of refdata.base.DatasetDescriptor + list of refdata.dataset.base.DatasetHandle """ with self.db.session() as session: datasets = session.query(Dataset).all() - return [DatasetDescriptor(ds.descriptor) for ds in datasets] + return [ + DatasetHandle( + descriptor=d.descriptor, + package_name=d.package_name, + package_version=d.package_version, + created_at=d.created_at, + datafile=self._datafile(d.dataset_id) + ) for d in datasets + ] def load( self, key: str, columns: Optional[List[str]] = None, @@ -302,31 +310,32 @@ def open(self, key: str, auto_download: Optional[bool] = None) -> DatasetHandle: Returns ------- - refdata.dataset.DatasetHandle + refdata.dataset.base.DatasetHandle Raises ------ refdata.error.NotDownloadedError """ - # Get the identifier and descriptor for the dataset. Raises error - # if dataset has not been downloaded and auto_download is False. - dataset_id, descriptor = None, None + # Return the dataset handle if the dataset has been downloaded before. with self.db.session() as session: dataset = self._get(session=session, key=key) if dataset is not None: - dataset_id = dataset.dataset_id - descriptor = dataset.descriptor + return DatasetHandle( + descriptor=dataset.descriptor, + package_name=dataset.package_name, + package_version=dataset.package_version, + created_at=dataset.created_at, + datafile=self._datafile(dataset.dataset_id) + ) # Attempt to download if it does not exist in the local store and either # of the given auto_download flag or the class global auto_download is - # True. 
- if dataset_id is None: - download = auto_download if auto_download is not None else self.auto_download - if download: - dataset_id, descriptor = self.download(key=key) - else: - raise err.NotDownloadedError(key=key) - # Return handle for the dataset. - return DatasetHandle(doc=descriptor, datafile=self._datafile(dataset_id)) + # True. Raises error if dataset has not been downloaded and + # auto_download is False. + download = auto_download if auto_download is not None else self.auto_download + if download: + return self.download(key=key) + else: + raise err.NotDownloadedError(key=key) def remove(self, key: str) -> bool: """Remove the dataset with the given (external) identifier from the diff --git a/requirements.txt b/requirements.txt index ecf96c6..b8770d8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,9 +1,12 @@ future appdirs>=1.4.4 pandas>=1.0.0 +python-dateutil +datasize>=1.0.0 pyyaml>=5.1 jsonschema SQLAlchemy>=1.3.18 pooch>=1.3.0 requests Click>=7.0.0 +tableprint diff --git a/setup.py b/setup.py index 7614174..19f2043 100644 --- a/setup.py +++ b/setup.py @@ -17,12 +17,15 @@ 'future', 'appdirs>=1.4.4', 'pandas>=1.0.0', + 'python-dateutil', + 'datasize>=1.0.0', 'pyyaml>=5.1', 'jsonschema', 'SQLAlchemy>=1.3.18', 'pooch>=1.3.0', 'requests', - 'Click>=7.0.0' + 'Click>=7.0.0', + 'tableprint' ] diff --git a/tests/dataset/test_dataset_handle.py b/tests/dataset/test_dataset_handle.py index f5212bc..aface7f 100644 --- a/tests/dataset/test_dataset_handle.py +++ b/tests/dataset/test_dataset_handle.py @@ -10,6 +10,7 @@ import pytest from refdata.dataset.base import DatasetHandle +from refdata.db import local_time import refdata.error as err @@ -29,4 +30,10 @@ def test_format_error(): } } with pytest.raises(err.InvalidFormatError): - DatasetHandle(doc=doc, datafile='/dev/null') + DatasetHandle( + descriptor=doc, + package_name='test', + package_version='0', + created_at=local_time(), + datafile='/dev/null' + ) diff --git a/tests/dataset/test_json_loader.py b/tests/dataset/test_json_loader.py index 70786c4..6294ae5 100644 --- a/tests/dataset/test_json_loader.py +++ b/tests/dataset/test_json_loader.py @@ -12,6 +12,7 @@ from refdata.dataset.base import DatasetHandle from refdata.dataset.consumer import DataCollector from refdata.dataset.json_loader import JQuery +from refdata.db import local_time # -- Loader ------------------------------------------------------------------- @@ -60,7 +61,13 @@ def test_json_loader(parameters, columns, first_row, countries_file, mock_response): descriptor = dict(DESCRIPTOR) descriptor['format'] = parameters - dataset = DatasetHandle(doc=descriptor, datafile=countries_file) + dataset = DatasetHandle( + descriptor=descriptor, + package_name='test', + package_version='0', + created_at=local_time(), + datafile=countries_file + ) data = dataset.load(columns=columns, consumer=DataCollector()).data assert len(data) == 2 assert data[0] == first_row diff --git a/tests/store/test_local_store.py b/tests/store/test_local_store.py index 195b277..6fc6cb6 100644 --- a/tests/store/test_local_store.py +++ b/tests/store/test_local_store.py @@ -20,15 +20,13 @@ def test_download_dataset(store): """Test downloading datasets to the local store.""" - dataset_id, descriptor = store.download(key='cities') - assert dataset_id is not None - assert descriptor['id'] == 'cities' - assert os.path.isfile(store._datafile(dataset_id)) + dataset = store.download(key='cities') + assert dataset.identifier == 'cities' + assert os.path.isfile(dataset.datafile) # No issue downloading the 
datset again. - dataset_id, descriptor = store.download(key='cities') - assert dataset_id is not None - assert descriptor['id'] == 'cities' - assert os.path.isfile(store._datafile(dataset_id)) + dataset = store.download(key='cities') + assert dataset.identifier == 'cities' + assert os.path.isfile(dataset.datafile) # Error when downloading unkown file. with pytest.raises(err.UnknownDatasetError): store.download(key='unknown') @@ -74,8 +72,7 @@ def test_local_store_init(tmpdir): key='my_key', descriptor={'id': 'my_key'}, package_name='test', - package_version='1', - filesize=0 + package_version='1' ) ) store = RefStore(basedir=basedir) @@ -89,7 +86,6 @@ def test_local_store_init(tmpdir): assert ds.descriptor == {'id': 'my_key'} assert ds.package_name == 'test' assert ds.package_version == '1' - assert ds.filesize == 0 assert ds.created_at is not None # Create the store with a connection string that points to the created # database. diff --git a/tests/test_database.py b/tests/test_database.py index bdf18b8..6bc33c2 100644 --- a/tests/test_database.py +++ b/tests/test_database.py @@ -36,8 +36,7 @@ def test_database_session(): key='my_key', descriptor={'id': 'my_key'}, package_name='test', - package_version='1', - filesize=0 + package_version='1' ) ) with db.session() as session: @@ -46,8 +45,7 @@ def test_database_session(): key='a_key', descriptor={'id': 'a_key'}, package_name='test', - package_version='1', - filesize=0 + package_version='1' ) ) with db.session() as session: @@ -61,7 +59,6 @@ def test_database_session(): key='my_key', descriptor={'id': 'my_key'}, package_name='test', - package_version='1', - filesize=0 + package_version='1' ) ) From 71e8a5733ebf3b09da526b40cac233657ce7f26d Mon Sep 17 00:00:00 2001 From: Heiko Mueller Date: Thu, 4 Feb 2021 13:50:18 -0500 Subject: [PATCH 06/11] Update version and changelog --- changelog.md | 7 +++++++ refdata/version.py | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/changelog.md b/changelog.md index 3927e01..f3eb572 100644 --- a/changelog.md +++ b/changelog.md @@ -8,3 +8,10 @@ ### 0.1.1 - 2020-02-03 * Use `appdirs.user_cache_dir` as parent directory for the default target directory for downloaded files (\#5). + + +### 0.2.0 - 2020-02-04 + +* Repository index loader for different data sources. It is now possible to load the repository index from an Url, a locak file, or directly from a given dictionary. +* Support loading index files in Json or YAML format. +* Add package information and timestamp for downloaded datasets. diff --git a/refdata/version.py b/refdata/version.py index 2110efc..e4442e7 100644 --- a/refdata/version.py +++ b/refdata/version.py @@ -6,4 +6,4 @@ # terms of the MIT License; see LICENSE file for more details. 
"""Information about the current version of the refdata package.""" -__version__ = '0.1.1' +__version__ = '0.2.0' From 5e2f082cfea8b93459c7020755cc2fb8a40f7d7c Mon Sep 17 00:00:00 2001 From: Heiko Mueller Date: Thu, 4 Feb 2021 13:54:07 -0500 Subject: [PATCH 07/11] Update example notebook --- docs/examples/Usage Example.ipynb | 59 ++++++++----------------------- 1 file changed, 15 insertions(+), 44 deletions(-) diff --git a/docs/examples/Usage Example.ipynb b/docs/examples/Usage Example.ipynb index cf73319..bdbff79 100644 --- a/docs/examples/Usage Example.ipynb +++ b/docs/examples/Usage Example.ipynb @@ -89,51 +89,19 @@ "metadata": {}, "outputs": [ { - "data": { - "text/plain": [ - "('a023b7d5233a4d35a15a11b2ec8b9cfa',\n", - " {'id': 'restcountries.eu',\n", - " 'name': 'REST Countries',\n", - " 'description': 'Information about countries in the world available from the restcountries.eu project.',\n", - " 'url': 'https://raw.githubusercontent.com/VIDA-NYU/openclean-reference-data/master/data/restcountries.eu.json',\n", - " 'checksum': '5893ebfad649533ac82a0b030a24efdd519f95a8b030a5ac9c7df37e85aad005',\n", - " 'webpage': 'https://restcountries.eu/',\n", - " 'schema': [{'id': 'name',\n", - " 'name': 'Name',\n", - " 'description': 'Country name',\n", - " 'dtype': 'text'},\n", - " {'id': 'alpha2Code',\n", - " 'name': 'Country Code (2-letters)',\n", - " 'description': 'ISO 3166-1 2-letter country code',\n", - " 'dtype': 'text'},\n", - " {'id': 'alpha3Code',\n", - " 'name': 'Country Code (3-letters)',\n", - " 'description': 'ISO 3166-1 3-letter country code',\n", - " 'dtype': 'text'},\n", - " {'id': 'capital',\n", - " 'name': 'Capital',\n", - " 'description': 'Capital city',\n", - " 'dtype': 'text'},\n", - " {'id': 'region',\n", - " 'name': 'Region',\n", - " 'description': 'World region',\n", - " 'dtype': 'text'},\n", - " {'id': 'subregion',\n", - " 'name': 'Sub-Region',\n", - " 'description': 'Sub-region within the country region',\n", - " 'dtype': 'text'}],\n", - " 'format': {'type': 'json', 'parameters': {}}})" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" + "name": "stdout", + "output_type": "stream", + "text": [ + "downloaded dataset restcountries.eu (size 316025 bytes).\n" + ] } ], "source": [ "# Download the restcountries dataset\n", "\n", - "refstore.download('restcountries.eu')" + "dataset = refstore.download('restcountries.eu')\n", + "\n", + "print('downloaded dataset {} (size {} bytes).'.format(dataset.identifier, dataset.filesize))" ] }, { @@ -148,8 +116,9 @@ "text": [ "Downloaded datasets:\n", "\n", - "> REST Countries (id=restcountries.eu)\n", - "> Cities in the U.S. (id=encyclopaedia_britannica:us_cities)\n" + "> C1 Street Suffix Abbreviations (id=usps:street_abbrev)\n", + "> Cities in the U.S. 
(id=encyclopaedia_britannica:us_cities)\n", + "> REST Countries (id=restcountries.eu)\n" ] } ], @@ -274,7 +243,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "[]\n" + "usps:street_abbrev\n", + "encyclopaedia_britannica:us_cities\n" ] } ], @@ -283,7 +253,8 @@ "\n", "refstore.remove('restcountries.eu')\n", "\n", - "print(refstore.list())" + "for dataset in refstore.list():\n", + " print(dataset.identifier)" ] }, { From bab8f19972d6afd5f5a1eed3fd08052487269d6a Mon Sep 17 00:00:00 2001 From: Heiko Mueller Date: Tue, 16 Feb 2021 16:10:47 -0500 Subject: [PATCH 08/11] Refactor load and open methods for datasets and the local data store --- refdata/cli/store.py | 2 +- refdata/dataset/base.py | 18 ++++- refdata/store/base.py | 124 +++++++++---------------------- tests/store/test_dataset_load.py | 23 ++++-- tests/store/test_local_store.py | 32 +++++--- 5 files changed, 87 insertions(+), 112 deletions(-) diff --git a/refdata/cli/store.py b/refdata/cli/store.py index ec3a27c..bca7082 100644 --- a/refdata/cli/store.py +++ b/refdata/cli/store.py @@ -98,4 +98,4 @@ def show_dataset(basedir, db, index, raw, key): # Read the index of given. loader = DictLoader(util.read_index(index)) if index is not None else UrlLoader() store = RefStore(basedir=basedir, loader=loader, connect_url=db) - util.print_dataset(dataset=store.open(key), raw=raw) + util.print_dataset(dataset=store.load(key), raw=raw) diff --git a/refdata/dataset/base.py b/refdata/dataset/base.py index ea3b683..6b56029 100644 --- a/refdata/dataset/base.py +++ b/refdata/dataset/base.py @@ -10,7 +10,7 @@ """ from dateutil.parser import isoparse -from typing import Dict, List, Optional, Set, Union +from typing import Dict, IO, List, Optional, Set, Union import datetime import gzip @@ -70,7 +70,7 @@ def __init__( else: raise err.InvalidFormatError("unknown format '{}'".format(parameters.format_type)) - def data_frame(self, columns: Optional[List[str]] = None) -> pd.DataFrame: + def df(self, columns: Optional[List[str]] = None) -> pd.DataFrame: """Load dataset as a pandas data frame. This is a shortcut to load all (or a given selection of) columns in @@ -193,3 +193,17 @@ def mapping( rhs = rhs if isinstance(rhs, list) else [rhs] consumer = MappingGenerator(split_at=len(lhs), ignore_equal=ignore_equal) return self.load(columns=lhs + rhs, consumer=consumer).to_mapping() + + def open(self) -> IO: + """Open the downloaded data file for the dataset. + + Returns + ------- + file-like object + """ + # Open the file depending on whether it is compressed or not. By now, + # we only support gzip compression. 
+ if self.compression == 'gzip': + return gzip.open(self.datafile, 'rt') + else: + return open(self.datafile, 'rt') diff --git a/refdata/store/base.py b/refdata/store/base.py index 7ce0b15..68d4c8a 100644 --- a/refdata/store/base.py +++ b/refdata/store/base.py @@ -11,10 +11,9 @@ from pooch.core import stream_download from pooch.downloaders import choose_downloader -from typing import Dict, List, Optional, Set, Union +from typing import IO, List, Optional import os -import pandas as pd from refdata.base import DatasetDescriptor from refdata.dataset.base import DatasetHandle @@ -117,30 +116,6 @@ def _datafile(self, dataset_id: str) -> str: """ return os.path.abspath(os.path.join(self.basedir, '{}.{}'.format(dataset_id, 'dat'))) - def distinct( - self, key: str, columns: Optional[Union[str, List[str]]] = None, - auto_download: Optional[bool] = None - ) -> Set: - """Shortcut to get the set of distinct values in one or more columns - for a downloaded dataset with the given identifier. - - Parameters - ---------- - key: string - External unique dataset identifier. - columns: list of string, default=None - Column identifier defining the content and returned distinct value - set. - auto_download: bool, default=None - Override the class global auto download flag. - - Returns - ------- - set - """ - dataset = self.open(key=key, auto_download=auto_download) - return dataset.distinct(columns=columns) - def download(self, key: str) -> DatasetHandle: """Download the dataset with the given (external) identifier. @@ -196,7 +171,7 @@ def download(self, key: str) -> DatasetHandle: package_version=self.package_version ) session.add(dataset) - return self.open(key=key) + return self.load(key=key) def _get(self, session: SessionScope, key: str) -> Dataset: """Get the database object for the dataset with the given key. If @@ -233,70 +208,13 @@ def list(self) -> List[DatasetHandle]: ) for d in datasets ] - def load( - self, key: str, columns: Optional[List[str]] = None, - auto_download: Optional[bool] = None - ) -> pd.DataFrame: - """Load the dataset with the given identifier as a pandas data frame. - - This is a shortcut to open the dataset with the given identifier (and - optionally download it first) and then reading data from the downloaded - file into a data frame. - - Parameters - ---------- - key: string - External unique dataset identifier. - columns: list of string, default=None - Column identifier defining the content and the schema of the - returned data frame. - auto_download: bool, default=None - Override the class global auto download flag. - - Returns - ------- - pd.DataFrame - """ - dataset = self.open(key=key, auto_download=auto_download) - return dataset.data_frame(columns=columns) - - def mapping( - self, key: str, lhs: Union[str, List[str]], rhs: Union[str, List[str]], - ignore_equal: Optional[bool] = True, auto_download: Optional[bool] = None - ) -> Dict: - """Generate a mapping from values in dataset rows. - - This is a shortcut to open the dataset with the given identifier (and - optionally download it first) and the generae a mapping from the - downloaded dataset for the given columns. - - Parameters - ---------- - key: string - External unique dataset identifier. - lhs: string or list of string - Columns defining the source of values for the left-hand side of the - mapping. - rhs: string or list of string - Columns defining the source of values for the right-hand side of the - mapping. - ignore_equal: bool, default=True - Exclude mappings from a value to itself from the created mapping. 
- auto_download: bool, default=None - Override the class global auto download flag. - - Returns - ------- - set - """ - dataset = self.open(key=key, auto_download=auto_download) - return dataset.mapping(lhs=lhs, rhs=rhs, ignore_equal=ignore_equal) + def load(self, key: str, auto_download: Optional[bool] = None) -> DatasetHandle: + """Get handle for the specified dataset. - def open(self, key: str, auto_download: Optional[bool] = None) -> DatasetHandle: - """Get handle for the specified dataset. If the dataset does not exist - in the local store it will be downloaded if the given auto_download - flag is True or if the class global auto_download flag is True. Note - that the auto_download argument will override the class global one. + If the dataset does not exist in the local store it will be downloaded + if the `auto_download` flag argument is True or if the class global + `auto_download` flag is True. Note that the `auto_download` argument + will override the class global one. If the dataset is not available in the local store (and not automatically downloaded) an error is raised. @@ -337,6 +255,32 @@ def open(self, key: str, auto_download: Optional[bool] = None) -> DatasetHandle: else: raise err.NotDownloadedError(key=key) + def open( + self, key: str, columns: Optional[List[str]] = None, + auto_download: Optional[bool] = None + ) -> IO: + """Open the dataset with the given identifier for reading. + + Returns a file-like object to read the dataset content. This is a + shortcut to open the dataset with the given identifier (and optionally + download it first). + + Parameters + ---------- + key: string + External unique dataset identifier. + columns: list of string, default=None + Column identifier defining the content and the schema of the + returned data frame. + auto_download: bool, default=None + Override the class global auto download flag. + + Returns + ------- + file-like object + """ + return self.load(key=key, auto_download=auto_download).open() + def remove(self, key: str) -> bool: """Remove the dataset with the given (external) identifier from the local store. Returns True if the dataset was removed and False if the diff --git a/tests/store/test_dataset_load.py b/tests/store/test_dataset_load.py index 86d38b1..d876ec9 100644 --- a/tests/store/test_dataset_load.py +++ b/tests/store/test_dataset_load.py @@ -12,10 +12,10 @@ def test_load_distinct(store): """Test downloading the U.S. cities test dataset and getting a list of distinct state names via the local data store. """ - values = store.distinct(key='cities', columns='state', auto_download=True) + values = store.load(key='cities', auto_download=True).distinct(columns='state') assert len(values) == 1 assert 'Alabama' in values - values = store.distinct(key='cities', auto_download=False) + values = store.load(key='cities').distinct() assert len(values) == 7 @@ -23,10 +23,10 @@ def test_load_data_frame(store): """Test downloading and loading the U.S. cities test dataset via the local data store. """ - df = store.load(key='cities', auto_download=True) + df = store.load(key='cities', auto_download=True).df() assert df.shape == (7, 2) assert list(df.columns) == ['city', 'state'] - df = store.load(key='cities', columns=['city']) + df = store.load(key='cities').df(columns=['city']) assert df.shape == (7, 1) assert list(df.columns) == ['city'] @@ -35,10 +35,19 @@ def test_load_mapping(store): """Test downloading the U.S. cities test dataset and getting a mapping of values for columns from the downloaded dataset. 
""" - mapping = store.mapping(key='cities', lhs='city', rhs='state', auto_download=True) + mapping = store.load(key='cities', auto_download=True).mapping(lhs='city', rhs='state') assert len(mapping) == 7 assert mapping['Troy'] == 'Alabama' - values = store.mapping(key='cities', lhs='city', rhs=['city']) + values = store.load(key='cities').mapping(lhs='city', rhs=['city']) assert len(values) == 0 - values = store.mapping(key='cities', lhs='city', rhs=['city'], ignore_equal=False) + values = store.load(key='cities').mapping(lhs='city', rhs=['city'], ignore_equal=False) assert len(values) == 7 + + +def test_read_dataset(store): + """Test reading the content of a downloaded dataset file.""" + with store.open(key='cities', auto_download=True) as f: + linecount = 0 + for line in f: + linecount += 1 + assert linecount == 8 diff --git a/tests/store/test_local_store.py b/tests/store/test_local_store.py index 6fc6cb6..cda7da3 100644 --- a/tests/store/test_local_store.py +++ b/tests/store/test_local_store.py @@ -7,6 +7,7 @@ """Unit tests for the local datastore.""" +import json import os import pytest @@ -53,6 +54,20 @@ def test_listing_dataset_in_local_store(store): assert 'countries' in datasets +def test_load_dataset(store): + """Test opening a downloaded dataset.""" + store.download(key='cities') + assert store.load('cities').identifier == 'cities' + # Error when opening a dataset that has not been downloaded and is not + # downloaded automatically. + with pytest.raises(err.NotDownloadedError): + store.load('countries') + with pytest.raises(err.NotDownloadedError): + store.load('countries', auto_download=False) + # The dataset can be opened if the auto_download flag is True. + assert store.load('countries', auto_download=True).identifier == 'countries' + + def test_local_store_init(tmpdir): """Test different scenarios for database creation when initializing the local store. @@ -109,18 +124,11 @@ def test_local_store_repo_manager(mock_response, tmpdir): assert len(store.repository().find()) == 0 -def test_open_dataset(store): - """Test opening a downloaded dataset.""" - store.download(key='cities') - assert store.open('cities').identifier == 'cities' - # Error when opening a dataset that has not been downloaded and is not - # downloaded automatically. - with pytest.raises(err.NotDownloadedError): - store.open('countries') - with pytest.raises(err.NotDownloadedError): - store.open('countries', auto_download=False) - # The dataset can be opened if the auto_download flag is True. 
- assert store.open('countries', auto_download=True).identifier == 'countries' +def test_read_dataset(store): + """Test reading a dataset using the open() method.""" + with store.open(key='countries', auto_download=True) as f: + doc = json.load(f) + assert len(doc) == 2 def test_remove_dataset(store): From 9a719f65a7ce13e3cd9de2e2decd6b2731280b4e Mon Sep 17 00:00:00 2001 From: Heiko Mueller Date: Tue, 16 Feb 2021 16:29:51 -0500 Subject: [PATCH 09/11] Add optional transformer to dataset distinct function (#4) --- refdata/dataset/base.py | 17 ++++++++++++++--- refdata/dataset/consumer.py | 29 +++++++++++++++++++++-------- tests/dataset/test_consumer.py | 8 ++++++++ tests/store/test_dataset_load.py | 3 +++ 4 files changed, 46 insertions(+), 11 deletions(-) diff --git a/refdata/dataset/base.py b/refdata/dataset/base.py index 6b56029..ea73f8b 100644 --- a/refdata/dataset/base.py +++ b/refdata/dataset/base.py @@ -10,7 +10,7 @@ """ from dateutil.parser import isoparse -from typing import Dict, IO, List, Optional, Set, Union +from typing import Callable, Dict, IO, List, Optional, Set, Union import datetime import gzip @@ -93,7 +93,10 @@ def df(self, columns: Optional[List[str]] = None) -> pd.DataFrame: consumer = DataFrameGenerator(columns=columns) return self.load(columns=columns, consumer=consumer).to_df() - def distinct(self, columns: Optional[Union[str, List[str]]] = None) -> Set: + def distinct( + self, columns: Optional[Union[str, List[str]]] = None, + transformer: Optional[Callable] = None + ) -> Set: """Get the set of distinct values from the specified column(s) in the dataset. @@ -104,11 +107,18 @@ def distinct(self, columns: Optional[Union[str, List[str]]] = None) -> Set: If more than one column is specified the elements in the returned set are tuples of values. + If the optional transformer is given it will be evaluated on the individual + values that are extracted from the columns before adding them to the set + of unique values. + Parameters ---------- columns: string or list of string, default=None Column identifier defining the values that are added to the generated set of distinct values. + transformer: callable, default=None + Optional transformer function that is evaluated on column values + before adding them to the set of distinct values. Returns ------- @@ -119,7 +129,8 @@ def distinct(self, columns: Optional[Union[str, List[str]]] = None) -> Set: columns = columns if columns is not None else [c.identifier for c in self.columns] # Ensure that columns are a list. columns = columns if isinstance(columns, list) else [columns] - return self.load(columns=columns, consumer=DistinctSetGenerator()).to_set() + consumer = DistinctSetGenerator(transformer=transformer) + return self.load(columns=columns, consumer=consumer).to_set() @property def filesize(self) -> int: diff --git a/refdata/dataset/consumer.py b/refdata/dataset/consumer.py index 5651230..cb1819a 100644 --- a/refdata/dataset/consumer.py +++ b/refdata/dataset/consumer.py @@ -10,7 +10,7 @@ """ from abc import ABCMeta, abstractmethod -from typing import Any, Dict, List, Optional, Set +from typing import Any, Callable, Dict, List, Optional, Set import pandas as pd @@ -114,9 +114,18 @@ class DistinctSetGenerator(DataConsumer): If the loader is reading multiple columns, a tuple of values is generated for each row before adding it to the set of distinct values. 
""" - def __init__(self): - """Initialize the empty set of distinct values.""" + def __init__(self, transformer: Optional[Callable] = None): + """Initialize the empty set of distinct values and the optional value + transformer. + + Parameters + ---------- + transformer: callable, default=None + Optional transformer function that is evaluated on column values + before adding them to the set of distinct values. + """ self.values = set() + self.transformer = transformer def consume(self, row: List): """Add the given row to the internal set of distinct values. @@ -132,7 +141,7 @@ def consume(self, row: List): List of column values for row in a dataset that is being read by a dataset loader. """ - self.values.add(to_value(row)) + self.values.add(to_value(row=row, transformer=self.transformer)) def to_set(self) -> Set: """Get the set of distinct values that has been created by the consumer @@ -219,10 +228,13 @@ def to_mapping(self) -> Dict: # -- Helper Functions --------------------------------------------------------- -def to_value(row: List) -> Any: - """Convert a given list of values into a scalar value or tuple. If the given - list contains a single element that element is returned. Otherwise, a tuple - of the values in the list is returned. +def to_value(row: List, transformer: Optional[Callable] = None) -> Any: + """Convert a given list of values into a scalar value or tuple. + + If the given list contains a single element that element is returned. + Otherwise, a tuple of the values in the list is returned. + + The optional tranformer is applied to all list values individually. Parameters ---------- @@ -233,4 +245,5 @@ def to_value(row: List) -> Any: ------- any """ + row = row if transformer is None else list(map(transformer, row)) return row[0] if len(row) == 1 else tuple(row) diff --git a/tests/dataset/test_consumer.py b/tests/dataset/test_consumer.py index 8b6ca4e..6e2cf7d 100644 --- a/tests/dataset/test_consumer.py +++ b/tests/dataset/test_consumer.py @@ -10,6 +10,7 @@ import pytest from refdata.dataset.consumer import DataFrameGenerator, DistinctSetGenerator, MappingGenerator +from refdata.dataset.consumer import to_value # List of rows for test purposes. 
@@ -78,3 +79,10 @@ def test_mapping_generator():
     assert mapping[('alice', 'smith')] == 23
     assert mapping[('alice', 'jones')] == 25
     assert mapping[('bob', 'jackson')] == 24
+
+
+def test_value_transformer():
+    """Test transformation of list values with optional transformer function."""
+    values = ['A', 'B']
+    assert to_value(row=values) == ('A', 'B')
+    assert to_value(row=values, transformer=str.lower) == ('a', 'b')
diff --git a/tests/store/test_dataset_load.py b/tests/store/test_dataset_load.py
index d876ec9..b7b4b59 100644
--- a/tests/store/test_dataset_load.py
+++ b/tests/store/test_dataset_load.py
@@ -15,6 +15,9 @@ def test_load_distinct(store):
     values = store.load(key='cities', auto_download=True).distinct(columns='state')
     assert len(values) == 1
     assert 'Alabama' in values
+    values = store.load(key='cities').distinct(columns='state', transformer=str.lower)
+    assert len(values) == 1
+    assert 'alabama' in values
     values = store.load(key='cities').distinct()
     assert len(values) == 7

From 06a1548015b799afe2975d83c03d1080eb369be5 Mon Sep 17 00:00:00 2001
From: Heiko Mueller
Date: Tue, 16 Feb 2021 16:45:39 -0500
Subject: [PATCH 10/11] Add optional transformer to dataset mapping function (#4)

---
 refdata/dataset/base.py          | 18 ++++++++++++++++--
 refdata/dataset/consumer.py      | 24 ++++++++++++++++++++----
 tests/store/test_dataset_load.py | 15 +++++++++++----
 3 files changed, 47 insertions(+), 10 deletions(-)

diff --git a/refdata/dataset/base.py b/refdata/dataset/base.py
index ea73f8b..971a7ef 100644
--- a/refdata/dataset/base.py
+++ b/refdata/dataset/base.py
@@ -10,7 +10,7 @@
 """
 
 from dateutil.parser import isoparse
-from typing import Callable, Dict, IO, List, Optional, Set, Union
+from typing import Callable, Dict, IO, List, Optional, Set, Tuple, Union
 
 import datetime
 import gzip
@@ -173,6 +173,7 @@ def load(self, columns: List[str], consumer: DataConsumer) -> DataConsumer:
 
     def mapping(
         self, lhs: Union[str, List[str]], rhs: Union[str, List[str]],
+        transformer: Optional[Union[Callable, Tuple[Callable, Callable]]] = None,
         ignore_equal: Optional[bool] = True
     ) -> Dict:
         """Generate a mapping from values in dataset rows.
@@ -184,6 +185,12 @@ def mapping(
         This is a shortcut to load column values using the mapping generator
         as the data consumer.
 
+        If the optional transformer is given it is evaluated on column values
+        before adding them to the mapping. If a single callable is given, that
+        function is evaluated on the lhs and rhs columns. If a 2-tuple of
+        callables is given, the first function is evaluated on lhs columns and
+        the second function on rhs columns.
+
         Parameters
         ----------
         lhs: string or list of string
@@ -192,6 +199,9 @@ def mapping(
         rhs: string or list of string
             Columns defining the source of values for the right-hand side of the
             mapping.
+        transformer: callable or tuple of callable, default=None
+            Optional transformer function(s) that are evaluated on the values
+            for lhs and rhs columns before adding them to the mapping.
         ignore_equal: bool, default=True
             Exclude mappings from a value to itself from the created mapping.
 
@@ -202,7 +212,11 @@ def mapping(
         # Ensure that lhs and rhs are lists.
lhs = lhs if isinstance(lhs, list) else [lhs] rhs = rhs if isinstance(rhs, list) else [rhs] - consumer = MappingGenerator(split_at=len(lhs), ignore_equal=ignore_equal) + consumer = MappingGenerator( + split_at=len(lhs), + transformer=transformer, + ignore_equal=ignore_equal + ) return self.load(columns=lhs + rhs, consumer=consumer).to_mapping() def open(self) -> IO: diff --git a/refdata/dataset/consumer.py b/refdata/dataset/consumer.py index cb1819a..e6595f7 100644 --- a/refdata/dataset/consumer.py +++ b/refdata/dataset/consumer.py @@ -10,7 +10,7 @@ """ from abc import ABCMeta, abstractmethod -from typing import Any, Callable, Dict, List, Optional, Set +from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union import pandas as pd @@ -163,7 +163,11 @@ class MappingGenerator(DataConsumer): mapping. If either side of the mapping involves multiple columns, a tuple of values for these columns is added to the mapping. """ - def __init__(self, split_at: int, ignore_equal: Optional[bool] = True): + def __init__( + self, split_at: int, + transformer: Optional[Union[Callable, Tuple[Callable, Callable]]] = None, + ignore_equal: Optional[bool] = True + ): """Initialize the dictionary for the mapping and the column index that separates the values in the left-hand side of the mapping from those in the right-hand side. @@ -177,11 +181,15 @@ def __init__(self, split_at: int, ignore_equal: Optional[bool] = True): split_at: int Columns index position at which rows are divided into left-hand side and right-hand side of the mapping. + transformer: callable or tuple of callable, default=None + Optional transformer function(s) that are evaluated on the values + for lhs and rhs columns before adding them to the mapping. ignore_equal: bool, default=True Exclude mappings from a value to itself from the created mapping. """ self.mapping = dict() self.split_at = split_at + self.transformer = transformer self.ignore_equal = ignore_equal def consume(self, row: List): @@ -201,8 +209,16 @@ def consume(self, row: List): List of column values for row in a dataset that is being read by a dataset loader. """ - lhs = to_value(row[:self.split_at]) - rhs = to_value(row[self.split_at:]) + # Set transformers for lhs and rhs columns. + transform_lhs, transform_rhs = None, None + if self.transformer is not None: + if isinstance(self.transformer, tuple): + transform_lhs, transform_rhs = self.transformer + else: + transform_lhs = self.transformer + transform_rhs = self.transformer + lhs = to_value(row[:self.split_at], transformer=transform_lhs) + rhs = to_value(row[self.split_at:], transformer=transform_rhs) # Ignore the row id lhs and rhs are equal and ignore_equal flag is True. 
if lhs == rhs and self.ignore_equal: return diff --git a/tests/store/test_dataset_load.py b/tests/store/test_dataset_load.py index b7b4b59..92764b8 100644 --- a/tests/store/test_dataset_load.py +++ b/tests/store/test_dataset_load.py @@ -41,10 +41,17 @@ def test_load_mapping(store): mapping = store.load(key='cities', auto_download=True).mapping(lhs='city', rhs='state') assert len(mapping) == 7 assert mapping['Troy'] == 'Alabama' - values = store.load(key='cities').mapping(lhs='city', rhs=['city']) - assert len(values) == 0 - values = store.load(key='cities').mapping(lhs='city', rhs=['city'], ignore_equal=False) - assert len(values) == 7 + dataset = store.load(key='cities') + mapping = dataset.mapping(lhs='city', rhs=['city']) + assert len(mapping) == 0 + mapping = dataset.mapping(lhs='city', rhs=['city'], ignore_equal=False) + assert len(mapping) == 7 + mapping = dataset.mapping(lhs='city', rhs='state', transformer=str.lower) + assert len(mapping) == 7 + assert mapping['troy'] == 'alabama' + mapping = dataset.mapping(lhs='city', rhs='state', transformer=(str.lower, str.upper)) + assert len(mapping) == 7 + assert mapping['troy'] == 'ALABAMA' def test_read_dataset(store): From 53b0464aa436e02f8d503b93b343a359370057a8 Mon Sep 17 00:00:00 2001 From: Heiko Mueller Date: Tue, 16 Feb 2021 16:54:38 -0500 Subject: [PATCH 11/11] Update changelog and example notebook --- changelog.md | 3 ++- docs/examples/Usage Example.ipynb | 21 ++++++++++----------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/changelog.md b/changelog.md index f3eb572..8adfdc4 100644 --- a/changelog.md +++ b/changelog.md @@ -10,8 +10,9 @@ * Use `appdirs.user_cache_dir` as parent directory for the default target directory for downloaded files (\#5). -### 0.2.0 - 2020-02-04 +### 0.2.0 - 2020-02-16 * Repository index loader for different data sources. It is now possible to load the repository index from an Url, a locak file, or directly from a given dictionary. * Support loading index files in Json or YAML format. * Add package information and timestamp for downloaded datasets. +* Add optional value transformers to `distinct()` and `mapping()` methods of the `DatasetHandle` (\#4) diff --git a/docs/examples/Usage Example.ipynb b/docs/examples/Usage Example.ipynb index bdbff79..6e6ab26 100644 --- a/docs/examples/Usage Example.ipynb +++ b/docs/examples/Usage Example.ipynb @@ -116,7 +116,6 @@ "text": [ "Downloaded datasets:\n", "\n", - "> C1 Street Suffix Abbreviations (id=usps:street_abbrev)\n", "> Cities in the U.S. 
(id=encyclopaedia_britannica:us_cities)\n", "> REST Countries (id=restcountries.eu)\n" ] @@ -157,7 +156,7 @@ "# in the restcountries dataset.\n", "\n", "print('Columns:\\n')\n", - "for col in refstore.open('restcountries.eu').columns:\n", + "for col in refstore.load('restcountries.eu').columns:\n", " print(' {} (id={})'.format(col.name, col.identifier))" ] }, @@ -230,7 +229,7 @@ "\n", "import json\n", "\n", - "print(json.dumps(refstore.open('restcountries.eu').to_dict(), indent=4))" + "print(json.dumps(refstore.load('restcountries.eu').to_dict(), indent=4))" ] }, { @@ -243,7 +242,6 @@ "name": "stdout", "output_type": "stream", "text": [ - "usps:street_abbrev\n", "encyclopaedia_britannica:us_cities\n" ] } @@ -357,7 +355,7 @@ "# which will download the datast if it is no in the local\n", "# store.\n", "\n", - "dataset = refstore.open('encyclopaedia_britannica:us_cities', auto_download=True)\n", + "dataset = refstore.load('encyclopaedia_britannica:us_cities', auto_download=True)\n", "# Alternative shortcut:\n", "# refstore.distinct(key='encyclopaedia_britannica:us_cities', columns='state')\n", "\n", @@ -383,7 +381,7 @@ { "data": { "text/plain": [ - "'Canberra'" + "'CANBERRA'" ] }, "execution_count": 9, @@ -394,13 +392,14 @@ "source": [ "# Get a lookup table (dictionary) that maps the\n", "# ISO 3166-1 3-letter country code to the country's\n", - "# captital city\n", + "# captital city. Convert values from both attributes\n", + "# to upper case before adding them to the mapping.\n", "\n", - "dataset = refstore.open('restcountries.eu', auto_download=True)\n", + "dataset = refstore.load('restcountries.eu', auto_download=True)\n", "# Alternative shortcut:\n", "# refstore.mapping(key='restcountries.eu', lhs='alpha3Code', rhs='capital')\n", "\n", - "mapping = dataset.mapping(lhs='alpha3Code', rhs='capital')\n", + "mapping = dataset.mapping(lhs='alpha3Code', rhs='capital', transformer=str.upper)\n", "\n", "mapping['AUS']" ] @@ -500,11 +499,11 @@ "# Get data frame with country name, 3-letter country code,\n", "# and capital city.\n", "\n", - "dataset = refstore.open('restcountries.eu', auto_download=True)\n", + "dataset = refstore.load('restcountries.eu', auto_download=True)\n", "# Alternative shortcut:\n", "# refstore.load('restcountries.eu', ['name', 'alpha3Code', 'capital'])\n", "\n", - "df = dataset.data_frame(['name', 'alpha3Code', 'capital'])\n", + "df = dataset.df(['name', 'alpha3Code', 'capital'])\n", "\n", "df.head()" ]
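
Taken together, this patch series replaces the tuple-returning store API with `DatasetHandle` objects, introduces the repository index loaders, and adds the optional value transformers. The snippet below is an illustrative sketch of the resulting 0.2.0 interface, not part of any patch; the dataset key `restcountries.eu` and its columns are taken from the example notebook, and the default repository index is assumed to be reachable.

```python
# Illustrative sketch of the refdata 0.2.0 API introduced by this patch series.
# Assumes the default repository index contains the 'restcountries.eu' dataset
# (the one used in the example notebook).
from refdata.store import RefStore

refstore = RefStore()

# download() now returns a DatasetHandle instead of (dataset_id, descriptor).
dataset = refstore.download('restcountries.eu')
print(dataset.identifier, dataset.filesize, dataset.created_at)

# Read the dataset through the handle: data frame, distinct values, mapping.
df = dataset.df(columns=['name', 'alpha3Code', 'capital'])
capitals = dataset.distinct(columns='capital', transformer=str.upper)
code_to_capital = dataset.mapping(
    lhs='alpha3Code', rhs='capital', transformer=(str.upper, str.lower)
)

# open() returns a file-like object over the raw downloaded data file.
with refstore.open('restcountries.eu', auto_download=True) as f:
    raw = f.read()
```

The `(str.upper, str.lower)` tuple follows the convention added in patch 10: when a 2-tuple of callables is given, the first is applied to the `lhs` values and the second to the `rhs` values of the mapping.
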