
Merge pull request #7 from VIDA-NYU/dev-0.2.0
Dev 0.2.0
heikomuller authored Feb 16, 2021
2 parents 5839b9d + 53b0464 commit 17c1483
Showing 38 changed files with 1,106 additions and 489 deletions.
8 changes: 8 additions & 0 deletions changelog.md
@@ -8,3 +8,11 @@
### 0.1.1 - 2020-02-03

* Use `appdirs.user_cache_dir` as parent directory for the default target directory for downloaded files (\#5).


### 0.2.0 - 2021-02-16

* Repository index loader for different data sources. It is now possible to load the repository index from a URL, a local file, or directly from a given dictionary (see the sketch below).
* Support loading index files in JSON or YAML format.
* Add package information and timestamp for downloaded datasets.
* Add optional value transformers to the `distinct()` and `mapping()` methods of `DatasetHandle` (\#4).
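
A minimal sketch of the new loader API, using only the classes that appear in this diff; the URL, file name, and dictionary content below are placeholders:

    from refdata.repo.loader import DictLoader, FileLoader, UrlLoader
    from refdata.repo.manager import RepositoryManager

    # Each loader returns the repository index document as a dictionary.
    doc = UrlLoader(url='https://example.org/index.json').load()  # from a URL
    doc = FileLoader('index.yaml').load()  # from a local JSON or YAML file
    doc = DictLoader({'datasets': []}).load()  # from a given dictionary

    # The loaded document can then be passed to the repository manager.
    datasets = RepositoryManager(doc=doc).find()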
80 changes: 25 additions & 55 deletions docs/examples/Usage Example.ipynb
@@ -31,9 +31,9 @@
"source": [
"# Create an instance of the local data store with default settings.\n",
"\n",
"from refdata.store import LocalStore\n",
"from refdata.store import RefStore\n",
"\n",
"refstore = LocalStore()"
"refstore = RefStore()"
]
},
{
@@ -89,51 +89,19 @@
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"('a023b7d5233a4d35a15a11b2ec8b9cfa',\n",
" {'id': 'restcountries.eu',\n",
" 'name': 'REST Countries',\n",
" 'description': 'Information about countries in the world available from the restcountries.eu project.',\n",
" 'url': 'https://raw.githubusercontent.com/VIDA-NYU/openclean-reference-data/master/data/restcountries.eu.json',\n",
" 'checksum': '5893ebfad649533ac82a0b030a24efdd519f95a8b030a5ac9c7df37e85aad005',\n",
" 'webpage': 'https://restcountries.eu/',\n",
" 'schema': [{'id': 'name',\n",
" 'name': 'Name',\n",
" 'description': 'Country name',\n",
" 'dtype': 'text'},\n",
" {'id': 'alpha2Code',\n",
" 'name': 'Country Code (2-letters)',\n",
" 'description': 'ISO 3166-1 2-letter country code',\n",
" 'dtype': 'text'},\n",
" {'id': 'alpha3Code',\n",
" 'name': 'Country Code (3-letters)',\n",
" 'description': 'ISO 3166-1 3-letter country code',\n",
" 'dtype': 'text'},\n",
" {'id': 'capital',\n",
" 'name': 'Capital',\n",
" 'description': 'Capital city',\n",
" 'dtype': 'text'},\n",
" {'id': 'region',\n",
" 'name': 'Region',\n",
" 'description': 'World region',\n",
" 'dtype': 'text'},\n",
" {'id': 'subregion',\n",
" 'name': 'Sub-Region',\n",
" 'description': 'Sub-region within the country region',\n",
" 'dtype': 'text'}],\n",
" 'format': {'type': 'json', 'parameters': {}}})"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
"name": "stdout",
"output_type": "stream",
"text": [
"downloaded dataset restcountries.eu (size 316025 bytes).\n"
]
}
],
"source": [
"# Download the restcountries dataset\n",
"\n",
"refstore.download('restcountries.eu')"
"dataset = refstore.download('restcountries.eu')\n",
"\n",
"print('downloaded dataset {} (size {} bytes).'.format(dataset.identifier, dataset.filesize))"
]
},
{
@@ -148,8 +116,8 @@
"text": [
"Downloaded datasets:\n",
"\n",
"> REST Countries (id=restcountries.eu)\n",
"> Cities in the U.S. (id=encyclopaedia_britannica:us_cities)\n"
"> Cities in the U.S. (id=encyclopaedia_britannica:us_cities)\n",
"> REST Countries (id=restcountries.eu)\n"
]
}
],
@@ -188,7 +156,7 @@
"# in the restcountries dataset.\n",
"\n",
"print('Columns:\\n')\n",
"for col in refstore.open('restcountries.eu').columns:\n",
"for col in refstore.load('restcountries.eu').columns:\n",
" print(' {} (id={})'.format(col.name, col.identifier))"
]
},
@@ -261,7 +229,7 @@
"\n",
"import json\n",
"\n",
"print(json.dumps(refstore.open('restcountries.eu').to_dict(), indent=4))"
"print(json.dumps(refstore.load('restcountries.eu').to_dict(), indent=4))"
]
},
{
Expand All @@ -274,7 +242,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"[<refdata.base.DatasetDescriptor object at 0x7f89f7d12040>]\n"
"encyclopaedia_britannica:us_cities\n"
]
}
],
@@ -283,7 +251,8 @@
"\n",
"refstore.remove('restcountries.eu')\n",
"\n",
"print(refstore.list())"
"for dataset in refstore.list():\n",
" print(dataset.identifier)"
]
},
{
@@ -386,7 +355,7 @@
"# which will download the datast if it is no in the local\n",
"# store.\n",
"\n",
"dataset = refstore.open('encyclopaedia_britannica:us_cities', auto_download=True)\n",
"dataset = refstore.load('encyclopaedia_britannica:us_cities', auto_download=True)\n",
"# Alternative shortcut:\n",
"# refstore.distinct(key='encyclopaedia_britannica:us_cities', columns='state')\n",
"\n",
@@ -412,7 +381,7 @@
{
"data": {
"text/plain": [
"'Canberra'"
"'CANBERRA'"
]
},
"execution_count": 9,
@@ -423,13 +392,14 @@
"source": [
"# Get a lookup table (dictionary) that maps the\n",
"# ISO 3166-1 3-letter country code to the country's\n",
"# captital city\n",
"# captital city. Convert values from both attributes\n",
"# to upper case before adding them to the mapping.\n",
"\n",
"dataset = refstore.open('restcountries.eu', auto_download=True)\n",
"dataset = refstore.load('restcountries.eu', auto_download=True)\n",
"# Alternative shortcut:\n",
"# refstore.mapping(key='restcountries.eu', lhs='alpha3Code', rhs='capital')\n",
"\n",
"mapping = dataset.mapping(lhs='alpha3Code', rhs='capital')\n",
"mapping = dataset.mapping(lhs='alpha3Code', rhs='capital', transformer=str.upper)\n",
"\n",
"mapping['AUS']"
]
@@ -529,11 +499,11 @@
"# Get data frame with country name, 3-letter country code,\n",
"# and capital city.\n",
"\n",
"dataset = refstore.open('restcountries.eu', auto_download=True)\n",
"dataset = refstore.load('restcountries.eu', auto_download=True)\n",
"# Alternative shortcut:\n",
"# refstore.load('restcountries.eu', ['name', 'alpha3Code', 'capital'])\n",
"\n",
"df = dataset.data_frame(['name', 'alpha3Code', 'capital'])\n",
"df = dataset.df(['name', 'alpha3Code', 'capital'])\n",
"\n",
"df.head()"
]
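
The notebook demonstrates the new `transformer` argument for `mapping()`. The changelog adds the same option to `distinct()`; the sketch below assumes `distinct()` accepts the keyword in the same way (the column name comes from the dataset schema shown above):

    from refdata.store import RefStore

    refstore = RefStore()
    dataset = refstore.load('restcountries.eu', auto_download=True)

    # Assumption: distinct() mirrors mapping() and applies the transformer
    # to each value before collecting the distinct values.
    regions = dataset.distinct(columns='region', transformer=str.lower)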
33 changes: 26 additions & 7 deletions refdata/cli/repo.py
@@ -8,8 +8,11 @@
"""Commands that interact with a repository index."""

import click
import tableprint as tp

from refdata.repo import RepositoryManager, validate
from refdata.repo.loader import DictLoader, UrlLoader
from refdata.repo.manager import RepositoryManager
from refdata.repo.schema import validate

import refdata.cli.util as util

@@ -26,9 +29,24 @@ def cli_repo():
@click.option('-i', '--index', required=False, help='Repository index file')
def list_repository(index):
"""List repository index content."""
# Read the index if given.
doc = util.read_index(index) if index is not None else None
util.print_datasets(RepositoryManager(doc=doc).find())
# Read the index from the optional file or URL. By default, the index that
# is specified in the environment is loaded.
loader = DictLoader(util.read_index(index)) if index is not None else UrlLoader()
datasets = RepositoryManager(doc=loader.load()).find()
headers = ['Identifier', 'Name', 'Description']
data = list()
# Maintain the maximum width for each column.
widths = [len(h) + 1 for h in headers]
# Sort datasets by name before output.
for dataset in sorted(datasets, key=lambda d: d.name):
desc = dataset.description if dataset.description is not None else ''
row = [dataset.identifier, dataset.name, desc]
for i in range(len(row)):
w = len(row[i]) + 1
if w > widths[i]:
widths[i] = w
data.append(row)
tp.table(data, headers=headers, width=widths, style='grid', out=util.TPrinter())


@cli_repo.command(name='show')
@@ -37,9 +55,10 @@ def list_repository(index):
@click.argument('key')
def show_dataset(index, raw, key):
"""Show dataset descriptor from repository index."""
# Read the index if given.
doc = util.read_index(index) if index is not None else None
util.print_dataset(dataset=RepositoryManager(doc=doc).get(key), raw=raw)
# Read the index from the optional file or URL. By default, the index that
# is specified in the environment is loaded.
loader = DictLoader(util.read_index(index)) if index is not None else UrlLoader()
util.print_dataset(dataset=RepositoryManager(doc=loader.load()).get(key), raw=raw)


@cli_repo.command(name='validate')
45 changes: 33 additions & 12 deletions refdata/cli/store.py
@@ -7,10 +7,13 @@

"""Commands that interact with the local data store."""

from datasize import DataSize

import click
import tableprint as tp

from refdata.repo import RepositoryManager
from refdata.store.base import LocalStore
from refdata.repo.loader import DictLoader, UrlLoader
from refdata.store.base import RefStore

import refdata.cli.util as util

@@ -31,8 +34,8 @@
def download_dataset(basedir, db, index, key):
"""List local store content."""
# Read the index if given.
doc = util.read_index(index) if index is not None else None
store = LocalStore(basedir=basedir, repo=RepositoryManager(doc=doc), connect_url=db)
loader = DictLoader(util.read_index(index)) if index is not None else UrlLoader()
store = RefStore(basedir=basedir, loader=loader, connect_url=db)
store.download(key)


@@ -43,9 +46,27 @@ def download_dataset(basedir, db, index, key):
def list_datasets(basedir, db, index):
"""List local store content."""
# Read the index if given.
doc = util.read_index(index) if index is not None else None
store = LocalStore(basedir=basedir, repo=RepositoryManager(doc=doc), connect_url=db)
util.print_datasets(store.list())
loader = DictLoader(util.read_index(index)) if index is not None else UrlLoader()
store = RefStore(basedir=basedir, loader=loader, connect_url=db)
datasets = store.list()
headers = ['Name', 'Size', 'Downloaded', 'Package']
data = list()
# Maintain the maximum width for each column.
widths = [len(h) + 1 for h in headers]
# Sort datasets by name before output.
for dataset in sorted(datasets, key=lambda d: d.name):
row = [
dataset.identifier,
'{:.2a}'.format(DataSize(dataset.filesize)),
' '.join(dataset.created_at.isoformat()[:19].split('T')),
'{} {}'.format(dataset.package_name, dataset.package_version)
]
for i in range(len(row)):
w = len(row[i]) + 1
if w > widths[i]:
widths[i] = w
data.append(row)
tp.table(data, headers=headers, width=widths, style='grid', out=util.TPrinter())


@cli_store.command(name='remove')
@@ -61,8 +82,8 @@ def remove_dataset(basedir, db, index, force, key):
msg = "Do you really want to remove dataset '{}'".format(key)
click.confirm(msg, default=True, abort=True)
# Read the index if given.
doc = util.read_index(index) if index is not None else None
store = LocalStore(basedir=basedir, repo=RepositoryManager(doc=doc), connect_url=db)
loader = DictLoader(util.read_index(index)) if index is not None else UrlLoader()
store = RefStore(basedir=basedir, loader=loader, connect_url=db)
store.remove(key)


@@ -75,6 +96,6 @@ def show_dataset(basedir, db, index, raw, key):
def show_dataset(basedir, db, index, raw, key):
"""Show descriptor for downloaded dataset."""
# Read the index if given.
doc = util.read_index(index) if index is not None else None
store = LocalStore(basedir=basedir, repo=RepositoryManager(doc=doc), connect_url=db)
util.print_dataset(dataset=store.open(key), raw=raw)
loader = DictLoader(util.read_index(index)) if index is not None else UrlLoader()
store = RefStore(basedir=basedir, loader=loader, connect_url=db)
util.print_dataset(dataset=store.load(key), raw=raw)
51 changes: 17 additions & 34 deletions refdata/cli/util.py
@@ -9,41 +9,22 @@
line interface.
"""

from typing import Dict, List
from typing import Dict

import click
import json
import os

from refdata.base import DatasetDescriptor
from refdata.repo import download_index
from refdata.repo.loader import FileLoader, UrlLoader


def print_datasets(datasets: List[DatasetDescriptor]):
"""Print a listing of datasets to the console.
class TPrinter:
"""Wrapper around `click.echo` for table printing."""
def write(self, s):
click.echo(s)

Outputs the identifier, name and description for each dataset in the given
list. Datasets are sorted by their name.
Parameters
----------
datasets: list of refdata.base.DatasetDescriptor
List of dataset descriptors.
"""
# Compute maximal length of values for the dataset identifier, name and
# description. The length values are used to align the output.
id_len = max([len(d.identifier) for d in datasets] + [10])
name_len = max([len(d.name) for d in datasets] + [4])
desc_len = max([len(d.description) for d in datasets if d.description is not None] + [11])
# Create the output template with all values left aligned.
template = '{:<' + str(id_len) + '} | {:<' + str(name_len) + '} | {:<' + str(desc_len) + '}'
click.echo()
click.echo(template.format('Identifier', 'Name', 'Description'))
click.echo(template.format('-' * id_len, '-' * name_len, '-' * desc_len))
# Sort datasets by name before output.
for dataset in sorted(datasets, key=lambda d: d.name):
desc = dataset.description if dataset.description is not None else ''
click.echo(template.format(dataset.identifier, dataset.name, desc))
def flush(self):
pass


def print_dataset(dataset: DatasetDescriptor, raw: bool):
@@ -88,8 +69,11 @@ def print_dataset(dataset: DatasetDescriptor, raw: bool):


def read_index(filename: str) -> Dict:
"""Read a repository index file. The filename may either reference a file
on the local file system or is expected to be an Url.
"""Read a repository index file.
The filename may either reference a file on the local file system or be
a URL. Attempts to read a local file first and loads from the URL if an
error occurs while reading the file.
Parameters
----------
@@ -101,8 +85,7 @@ def read_index(filename: str) -> Dict:
dict
"""
try:
with open(filename, 'r') as f:
return json.load(f)
except OSError as ex:
print(ex)
return download_index(url=filename)
return FileLoader(filename).load()
except (IOError, OSError):
pass
return UrlLoader(url=filename).load()
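
A quick illustration of the resulting fallback order in `read_index` (both arguments below are placeholders):

    # A readable local file is handled by the FileLoader ...
    doc = read_index('data/index.yaml')
    # ... while anything that cannot be opened as a file is treated as a URL.
    doc = read_index('https://example.org/index.json')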