
Commit 5671252

Merge branch 'root-fields'
amercader committed Oct 5, 2018
2 parents 8ecff5d + 25b19b9, commit 5671252
Showing 2 changed files with 52 additions and 13 deletions.
43 changes: 30 additions & 13 deletions ckanext/dcat/harvesters/rdf.py
@@ -31,6 +31,26 @@ def info(self):
             'description': 'Harvester for DCAT datasets from an RDF graph'
         }
 
+    def _get_dict_value(self, _dict, key, default=None):
+        '''
+        Returns the value for the given key on a CKAN dict
+
+        By default a key on the root level is checked. If not found, extras
+        are checked, both with the key provided and with `dcat_` prepended to
+        support legacy fields.
+
+        If not found, returns the default value, which defaults to None
+        '''
+
+        if key in _dict:
+            return _dict[key]
+
+        for extra in _dict.get('extras', []):
+            if extra['key'] == key or extra['key'] == 'dcat_' + key:
+                return extra['value']
+
+        return default
+
     def _get_guid(self, dataset_dict, source_url=None):
         '''
         Try to get a unique identifier for a harvested dataset
@@ -47,23 +67,18 @@ def _get_guid(self, dataset_dict, source_url=None):
         Returns None if no guid could be decided.
         '''
         guid = None
-        for extra in dataset_dict.get('extras', []):
-            if extra['key'] == 'uri' and extra['value']:
-                return extra['value']
-
-        for extra in dataset_dict.get('extras', []):
-            if extra['key'] == 'identifier' and extra['value']:
-                return extra['value']
-
-        for extra in dataset_dict.get('extras', []):
-            if extra['key'] == 'dcat_identifier' and extra['value']:
-                return extra['value']
+        guid = (
+            self._get_dict_value(dataset_dict, 'uri') or
+            self._get_dict_value(dataset_dict, 'identifier')
+        )
+        if guid:
+            return guid
 
         if dataset_dict.get('name'):
             guid = dataset_dict['name']
             if source_url:
                 guid = source_url.rstrip('/') + '/' + guid
 
         return guid
 
     def _mark_datasets_for_deletion(self, guids_in_source, harvest_job):
@@ -185,19 +200,21 @@ def gather_stage(self, harvest_job):
                 return []
 
             try:
+
+                source_dataset = model.Package.get(harvest_job.source.id)
+
                 for dataset in parser.datasets():
                     if not dataset.get('name'):
                         dataset['name'] = self._gen_new_name(dataset['title'])
 
                     # Unless already set by the parser, get the owner organization (if any)
                     # from the harvest source dataset
                     if not dataset.get('owner_org'):
-                        source_dataset = model.Package.get(harvest_job.source.id)
                         if source_dataset.owner_org:
                             dataset['owner_org'] = source_dataset.owner_org
 
                     # Try to get a unique identifier for the harvested dataset
-                    guid = self._get_guid(dataset)
+                    guid = self._get_guid(dataset, source_url=source_dataset.url)
 
                     if not guid:
                         self._save_gather_error('Could not get a unique identifier for dataset: {0}'.format(dataset),
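For reference, the change can be exercised outside CKAN. Below is a minimal standalone sketch (plain functions, not the DCATRDFHarvester methods themselves; the names get_dict_value/get_guid and the example values are illustrative only) of the lookup order introduced here: a root-level field wins, then the extras are checked, including the legacy dcat_-prefixed key, and finally the dataset name, optionally prefixed with the harvest source URL.

def get_dict_value(dataset, key, default=None):
    # Root-level key first
    if key in dataset:
        return dataset[key]
    # Then extras, accepting the legacy 'dcat_'-prefixed key
    for extra in dataset.get('extras', []):
        if extra['key'] == key or extra['key'] == 'dcat_' + key:
            return extra['value']
    return default


def get_guid(dataset, source_url=None):
    guid = get_dict_value(dataset, 'uri') or get_dict_value(dataset, 'identifier')
    if guid:
        return guid
    # Fall back to the dataset name, prefixed with the source URL if given
    if dataset.get('name'):
        guid = dataset['name']
        if source_url:
            guid = source_url.rstrip('/') + '/' + guid
    return guid


# Root-level field wins over an extra:
print(get_guid({'identifier': 'abc',
                'extras': [{'key': 'dcat_identifier', 'value': 'xyz'}]}))  # abc

# Legacy extra still resolves when no root field is present:
print(get_guid({'extras': [{'key': 'dcat_identifier', 'value': 'xyz'}]}))  # xyz

# Name fallback, prefixed with the harvest source URL:
print(get_guid({'name': 'some-dataset'},
               source_url='http://example.com/catalog/'))  # http://example.com/catalog/some-dataset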
22 changes: 22 additions & 0 deletions ckanext/dcat/tests/test_harvester.py
@@ -132,6 +132,28 @@ def before_create(self, harvest_object, dataset_dict, temp_dict):
 
 class TestDCATHarvestUnit(object):
 
+    def test_get_guid_uri_root(self):
+
+        dataset = {
+            'name': 'test-dataset',
+            'uri': 'http://dataset/uri',
+        }
+
+        guid = DCATRDFHarvester()._get_guid(dataset)
+
+        eq_(guid, 'http://dataset/uri')
+
+    def test_get_guid_identifier_root(self):
+
+        dataset = {
+            'name': 'test-dataset',
+            'identifier': 'http://dataset/uri',
+        }
+
+        guid = DCATRDFHarvester()._get_guid(dataset)
+
+        eq_(guid, 'http://dataset/uri')
+
     def test_get_guid_uri(self):
 
         dataset = {
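The two tests added above cover the new root-level fields. A companion test for the legacy dcat_-prefixed extra would follow the same pattern; the sketch below is hypothetical (it is not part of this commit and may duplicate coverage in the collapsed part of the file):

    def test_get_guid_dcat_identifier_extra(self):

        # Hypothetical test, not part of this commit: the legacy
        # 'dcat_identifier' extra should resolve via the 'dcat_' fallback
        dataset = {
            'name': 'test-dataset',
            'extras': [
                {'key': 'dcat_identifier', 'value': 'dataset-dcat-id'},
            ]
        }

        guid = DCATRDFHarvester()._get_guid(dataset)

        eq_(guid, 'dataset-dcat-id')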
