
Merge pull request #7 from VIDA-NYU/dev-0.2.0
Dev 0.2.0
heikomuller authored Feb 16, 2021
2 parents 5839b9d + 53b0464 commit 17c1483
Showing 38 changed files with 1,106 additions and 489 deletions.
8 changes: 8 additions & 0 deletions changelog.md
@@ -8,3 +8,11 @@
### 0.1.1 - 2020-02-03

* Use `appdirs.user_cache_dir` as parent directory for the default target directory for downloaded files (\#5).


### 0.2.0 - 2021-02-16

* Repository index loader for different data sources. It is now possible to load the repository index from a URL, a local file, or directly from a given dictionary (see the sketch below).
* Support loading index files in JSON or YAML format.
* Add package information and timestamp for downloaded datasets.
* Add optional value transformers to the `distinct()` and `mapping()` methods of `DatasetHandle` (\#4).
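
A minimal sketch of the new loader API, using only the classes that appear in this diff; the URL, file name, and dictionary content below are placeholders:

    from refdata.repo.loader import DictLoader, FileLoader, UrlLoader
    from refdata.repo.manager import RepositoryManager

    # Each loader returns the repository index document as a dictionary.
    doc = UrlLoader(url='https://example.org/index.json').load()  # from a URL
    doc = FileLoader('index.yaml').load()  # from a local JSON or YAML file
    doc = DictLoader({'datasets': []}).load()  # from a given dictionary

    # The loaded document can then be passed to the repository manager.
    datasets = RepositoryManager(doc=doc).find()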
80 changes: 25 additions & 55 deletions docs/examples/Usage Example.ipynb
@@ -31,9 +31,9 @@
"source": [
"# Create an instance of the local data store with default settings.\n",
"\n",
"from refdata.store import LocalStore\n",
"from refdata.store import RefStore\n",
"\n",
"refstore = LocalStore()"
"refstore = RefStore()"
]
},
{
@@ -89,51 +89,19 @@
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"('a023b7d5233a4d35a15a11b2ec8b9cfa',\n",
" {'id': 'restcountries.eu',\n",
" 'name': 'REST Countries',\n",
" 'description': 'Information about countries in the world available from the restcountries.eu project.',\n",
" 'url': 'https://raw.githubusercontent.com/VIDA-NYU/openclean-reference-data/master/data/restcountries.eu.json',\n",
" 'checksum': '5893ebfad649533ac82a0b030a24efdd519f95a8b030a5ac9c7df37e85aad005',\n",
" 'webpage': 'https://restcountries.eu/',\n",
" 'schema': [{'id': 'name',\n",
" 'name': 'Name',\n",
" 'description': 'Country name',\n",
" 'dtype': 'text'},\n",
" {'id': 'alpha2Code',\n",
" 'name': 'Country Code (2-letters)',\n",
" 'description': 'ISO 3166-1 2-letter country code',\n",
" 'dtype': 'text'},\n",
" {'id': 'alpha3Code',\n",
" 'name': 'Country Code (3-letters)',\n",
" 'description': 'ISO 3166-1 3-letter country code',\n",
" 'dtype': 'text'},\n",
" {'id': 'capital',\n",
" 'name': 'Capital',\n",
" 'description': 'Capital city',\n",
" 'dtype': 'text'},\n",
" {'id': 'region',\n",
" 'name': 'Region',\n",
" 'description': 'World region',\n",
" 'dtype': 'text'},\n",
" {'id': 'subregion',\n",
" 'name': 'Sub-Region',\n",
" 'description': 'Sub-region within the country region',\n",
" 'dtype': 'text'}],\n",
" 'format': {'type': 'json', 'parameters': {}}})"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
"name": "stdout",
"output_type": "stream",
"text": [
"downloaded dataset restcountries.eu (size 316025 bytes).\n"
]
}
],
"source": [
"# Download the restcountries dataset\n",
"\n",
"refstore.download('restcountries.eu')"
"dataset = refstore.download('restcountries.eu')\n",
"\n",
"print('downloaded dataset {} (size {} bytes).'.format(dataset.identifier, dataset.filesize))"
]
},
{
@@ -148,8 +116,8 @@
"text": [
"Downloaded datasets:\n",
"\n",
"> REST Countries (id=restcountries.eu)\n",
"> Cities in the U.S. (id=encyclopaedia_britannica:us_cities)\n"
"> Cities in the U.S. (id=encyclopaedia_britannica:us_cities)\n",
"> REST Countries (id=restcountries.eu)\n"
]
}
],
@@ -188,7 +156,7 @@
"# in the restcountries dataset.\n",
"\n",
"print('Columns:\\n')\n",
"for col in refstore.open('restcountries.eu').columns:\n",
"for col in refstore.load('restcountries.eu').columns:\n",
" print(' {} (id={})'.format(col.name, col.identifier))"
]
},
@@ -261,7 +229,7 @@
"\n",
"import json\n",
"\n",
"print(json.dumps(refstore.open('restcountries.eu').to_dict(), indent=4))"
"print(json.dumps(refstore.load('restcountries.eu').to_dict(), indent=4))"
]
},
{
Expand All @@ -274,7 +242,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"[<refdata.base.DatasetDescriptor object at 0x7f89f7d12040>]\n"
"encyclopaedia_britannica:us_cities\n"
]
}
],
@@ -283,7 +251,8 @@
"\n",
"refstore.remove('restcountries.eu')\n",
"\n",
"print(refstore.list())"
"for dataset in refstore.list():\n",
" print(dataset.identifier)"
]
},
{
@@ -386,7 +355,7 @@
"# which will download the datast if it is no in the local\n",
"# store.\n",
"\n",
"dataset = refstore.open('encyclopaedia_britannica:us_cities', auto_download=True)\n",
"dataset = refstore.load('encyclopaedia_britannica:us_cities', auto_download=True)\n",
"# Alternative shortcut:\n",
"# refstore.distinct(key='encyclopaedia_britannica:us_cities', columns='state')\n",
"\n",
@@ -412,7 +381,7 @@
{
"data": {
"text/plain": [
"'Canberra'"
"'CANBERRA'"
]
},
"execution_count": 9,
@@ -423,13 +392,14 @@
"source": [
"# Get a lookup table (dictionary) that maps the\n",
"# ISO 3166-1 3-letter country code to the country's\n",
"# captital city\n",
"# captital city. Convert values from both attributes\n",
"# to upper case before adding them to the mapping.\n",
"\n",
"dataset = refstore.open('restcountries.eu', auto_download=True)\n",
"dataset = refstore.load('restcountries.eu', auto_download=True)\n",
"# Alternative shortcut:\n",
"# refstore.mapping(key='restcountries.eu', lhs='alpha3Code', rhs='capital')\n",
"\n",
"mapping = dataset.mapping(lhs='alpha3Code', rhs='capital')\n",
"mapping = dataset.mapping(lhs='alpha3Code', rhs='capital', transformer=str.upper)\n",
"\n",
"mapping['AUS']"
]
@@ -529,11 +499,11 @@
"# Get data frame with country name, 3-letter country code,\n",
"# and capital city.\n",
"\n",
"dataset = refstore.open('restcountries.eu', auto_download=True)\n",
"dataset = refstore.load('restcountries.eu', auto_download=True)\n",
"# Alternative shortcut:\n",
"# refstore.load('restcountries.eu', ['name', 'alpha3Code', 'capital'])\n",
"\n",
"df = dataset.data_frame(['name', 'alpha3Code', 'capital'])\n",
"df = dataset.df(['name', 'alpha3Code', 'capital'])\n",
"\n",
"df.head()"
]
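
The notebook demonstrates the new `transformer` argument for `mapping()`. The changelog adds the same option to `distinct()`; the sketch below assumes `distinct()` accepts the keyword in the same way (the column name comes from the dataset schema shown above):

    from refdata.store import RefStore

    refstore = RefStore()
    dataset = refstore.load('restcountries.eu', auto_download=True)

    # Assumption: distinct() mirrors mapping() and applies the transformer
    # to each value before collecting the distinct values.
    regions = dataset.distinct(columns='region', transformer=str.lower)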
33 changes: 26 additions & 7 deletions refdata/cli/repo.py
@@ -8,8 +8,11 @@
"""Commands that interact with a repository index."""

import click
import tableprint as tp

from refdata.repo import RepositoryManager, validate
from refdata.repo.loader import DictLoader, UrlLoader
from refdata.repo.manager import RepositoryManager
from refdata.repo.schema import validate

import refdata.cli.util as util

@@ -26,9 +29,24 @@ def cli_repo():
@click.option('-i', '--index', required=False, help='Repository index file')
def list_repository(index):
"""List repository index content."""
# Read the index if given.
doc = util.read_index(index) if index is not None else None
util.print_datasets(RepositoryManager(doc=doc).find())
# Read the index from the optional file or URL. By default, the index that
# is specified in the environment is loaded.
loader = DictLoader(util.read_index(index)) if index is not None else UrlLoader()
datasets = RepositoryManager(doc=loader.load()).find()
headers = ['Identifier', 'Name', 'Description']
data = list()
# Maintain the maximum width for each column.
widths = [len(h) + 1 for h in headers]
# Sort datasets by name before output.
for dataset in sorted(datasets, key=lambda d: d.name):
desc = dataset.description if dataset.description is not None else ''
row = [dataset.identifier, dataset.name, desc]
for i in range(len(row)):
w = len(row[i]) + 1
if w > widths[i]:
widths[i] = w
data.append(row)
tp.table(data, headers=headers, width=widths, style='grid', out=util.TPrinter())


@cli_repo.command(name='show')
@@ -37,9 +55,10 @@ def list_repository(index):
@click.argument('key')
def show_dataset(index, raw, key):
"""Show dataset descriptor from repository index."""
# Read the index if given.
doc = util.read_index(index) if index is not None else None
util.print_dataset(dataset=RepositoryManager(doc=doc).get(key), raw=raw)
# Read the index from the optional file or URL. By default, the index that
# is specified in the environment is loaded.
loader = DictLoader(util.read_index(index)) if index is not None else UrlLoader()
util.print_dataset(dataset=RepositoryManager(doc=loader.load()).get(key), raw=raw)


@cli_repo.command(name='validate')
45 changes: 33 additions & 12 deletions refdata/cli/store.py
@@ -7,10 +7,13 @@

"""Commands that interact with the local data store."""

from datasize import DataSize

import click
import tableprint as tp

from refdata.repo import RepositoryManager
from refdata.store.base import LocalStore
from refdata.repo.loader import DictLoader, UrlLoader
from refdata.store.base import RefStore

import refdata.cli.util as util

@@ -31,8 +34,8 @@
def download_dataset(basedir, db, index, key):
"""List local store content."""
# Read the index if given.
doc = util.read_index(index) if index is not None else None
store = LocalStore(basedir=basedir, repo=RepositoryManager(doc=doc), connect_url=db)
loader = DictLoader(util.read_index(index)) if index is not None else UrlLoader()
store = RefStore(basedir=basedir, loader=loader, connect_url=db)
store.download(key)


@@ -43,9 +46,27 @@ def download_dataset(basedir, db, index, key):
def list_datasets(basedir, db, index):
"""List local store content."""
# Read the index if given.
doc = util.read_index(index) if index is not None else None
store = LocalStore(basedir=basedir, repo=RepositoryManager(doc=doc), connect_url=db)
util.print_datasets(store.list())
loader = DictLoader(util.read_index(index)) if index is not None else UrlLoader()
store = RefStore(basedir=basedir, loader=loader, connect_url=db)
datasets = store.list()
headers = ['Name', 'Size', 'Downloaded', 'Package']
data = list()
# Maintain the maximum width for each column.
widths = [len(h) + 1 for h in headers]
# Sort datasets by name before output.
for dataset in sorted(datasets, key=lambda d: d.name):
row = [
dataset.identifier,
'{:.2a}'.format(DataSize(dataset.filesize)),
' '.join(dataset.created_at.isoformat()[:19].split('T')),
'{} {}'.format(dataset.package_name, dataset.package_version)
]
for i in range(len(row)):
w = len(row[i]) + 1
if w > widths[i]:
widths[i] = w
data.append(row)
tp.table(data, headers=headers, width=widths, style='grid', out=util.TPrinter())


@cli_store.command(name='remove')
@@ -61,8 +82,8 @@ def remove_dataset(basedir, db, index, force, key):
msg = "Do you really want to remove dataset '{}'".format(key)
click.confirm(msg, default=True, abort=True)
# Read the index if given.
doc = util.read_index(index) if index is not None else None
store = LocalStore(basedir=basedir, repo=RepositoryManager(doc=doc), connect_url=db)
loader = DictLoader(util.read_index(index)) if index is not None else UrlLoader()
store = RefStore(basedir=basedir, loader=loader, connect_url=db)
store.remove(key)


@@ -75,6 +96,6 @@ def show_dataset(basedir, db, index, raw, key):
def show_dataset(basedir, db, index, raw, key):
"""Show descriptor for downloaded dataset."""
# Read the index if given.
doc = util.read_index(index) if index is not None else None
store = LocalStore(basedir=basedir, repo=RepositoryManager(doc=doc), connect_url=db)
util.print_dataset(dataset=store.open(key), raw=raw)
loader = DictLoader(util.read_index(index)) if index is not None else UrlLoader()
store = RefStore(basedir=basedir, loader=loader, connect_url=db)
util.print_dataset(dataset=store.load(key), raw=raw)
51 changes: 17 additions & 34 deletions refdata/cli/util.py
@@ -9,41 +9,22 @@
line interface.
"""

from typing import Dict, List
from typing import Dict

import click
import json
import os

from refdata.base import DatasetDescriptor
from refdata.repo import download_index
from refdata.repo.loader import FileLoader, UrlLoader


def print_datasets(datasets: List[DatasetDescriptor]):
"""Print a listing of datasets to the console.
class TPrinter:
"""Wrapper around `click.echo` for table printing."""
def write(self, s):
click.echo(s)

Outputs the identifier, name and description for each dataset in the given
list. Datasets are sorted by their name.
Parameters
----------
datasets: list of refdata.base.DatasetDescriptor
List of dataset descriptors.
"""
# Compute maximal length of values for the dataset identifier, name and
# description. The length values are used to align the output.
id_len = max([len(d.identifier) for d in datasets] + [10])
name_len = max([len(d.name) for d in datasets] + [4])
desc_len = max([len(d.description) for d in datasets if d.description is not None] + [11])
# Create the output template with all values left aligned.
template = '{:<' + str(id_len) + '} | {:<' + str(name_len) + '} | {:<' + str(desc_len) + '}'
click.echo()
click.echo(template.format('Identifier', 'Name', 'Description'))
click.echo(template.format('-' * id_len, '-' * name_len, '-' * desc_len))
# Sort datasets by name before output.
for dataset in sorted(datasets, key=lambda d: d.name):
desc = dataset.description if dataset.description is not None else ''
click.echo(template.format(dataset.identifier, dataset.name, desc))
def flush(self):
pass


def print_dataset(dataset: DatasetDescriptor, raw: bool):
@@ -88,8 +69,11 @@ def print_dataset(dataset: DatasetDescriptor, raw: bool):


def read_index(filename: str) -> Dict:
"""Read a repository index file. The filename may either reference a file
on the local file system or is expected to be an Url.
"""Read a repository index file.
The filename may either reference a file on the local file system or be
a URL. Attempts to read a local file first and loads from the URL if an
error occurs while reading the file.
Parameters
----------
@@ -101,8 +85,7 @@ def read_index(filename: str) -> Dict:
dict
"""
try:
with open(filename, 'r') as f:
return json.load(f)
except OSError as ex:
print(ex)
return download_index(url=filename)
return FileLoader(filename).load()
except (IOError, OSError):
pass
return UrlLoader(url=filename).load()
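
A quick illustration of the resulting fallback order in `read_index` (both arguments below are placeholders):

    # A readable local file is handled by the FileLoader ...
    doc = read_index('data/index.yaml')
    # ... while anything that cannot be opened as a file is treated as a URL.
    doc = read_index('https://example.org/index.json')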