Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CSW harvester OutputSchema config support #258 #259

Open
wants to merge 9 commits into
base: master
Choose a base branch
from
8 changes: 6 additions & 2 deletions ckanext/spatial/harvesters/csw.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,10 +159,14 @@ def fetch_stage(self,harvest_object):
self._save_object_error('Error contacting the CSW server: %s' % e,
harvest_object)
return False


# load config
self._set_source_config(harvest_object.source.config)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you document the new output_schema option and its default value in here so others are aware of it?

https://github.com/ckan/ckanext-spatial/blob/master/doc/harvesters.rst

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

done

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added a fallback to the default in case the server does not support the iso19139 -> iso19115 transformation.
The fallback logs the failure and switches back to the default, requesting iso19139 -> iso19139.

# get output_schema from config
namespace = self.source_config.get('output_schema',self.output_schema())
identifier = harvest_object.guid
try:
record = self.csw.getrecordbyid([identifier], outputschema=self.output_schema())
record = self.csw.getrecordbyid([identifier], outputschema=namespace)
except Exception as e:
self._save_object_error('Error getting the CSW record with GUID %s' % identifier, harvest_object)
return False
Expand Down
59 changes: 47 additions & 12 deletions ckanext/spatial/lib/csw_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,14 +70,36 @@ class CswService(OwsService):
def __init__(self, endpoint=None):
    """Set up the CSW client and cache the server capabilities.

    :param endpoint: value forwarded to the parent initialiser and to
        ``getcapabilities``.
    """
    super(CswService, self).__init__(endpoint)
    self.sortby = SortBy([SortProperty('dc:identifier')])
    # Fetch the capabilities document once and keep the parsed tree on
    # the instance so later lookups can query it.
    capabilities_xml = self.getcapabilities(endpoint)['response']
    capabilities_root = etree.fromstring(capabilities_xml)
    self.capabilities = etree.ElementTree(capabilities_root)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please try to follow the PEP8 guidelines, especially the spacing around = and after , :)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry, I can't validate the whole project and my code editor is not helping me. Good catch, I'll fix my mistake.


def _get_output_schemas(self, operation):
    """Return the output schemas advertised for a CSW operation.

    Looks up the ``ows:Operation`` element named *operation* in
    ``self.capabilities`` and reads its ``outputSchema`` parameter.

    :param operation: operation name as it appears in the ``name``
        attribute of ``ows:Operation`` (e.g. ``'GetRecords'``).
    :returns: dict mapping each namespace prefix found in the parameter
        element's ``nsmap`` to its namespace URI, restricted to the URIs
        listed as ``ows:Value`` children of the parameter.
    :raises CswError: if the capabilities document declares no ``ows``
        namespace, does not contain the requested operation, or the
        operation has no ``outputSchema`` parameter.
    """
    cap_ns = self.capabilities.getroot().nsmap
    ows_ns = cap_ns.get('ows')
    if not ows_ns:
        raise CswError('Bad getcapabilities response: OWS namespace not found ' + str(cap_ns))

    op = self.capabilities.find(
        "//{{{}}}Operation[@name='{}']".format(ows_ns, operation))
    if op is None:
        # find() returns None on no match; fail with a CSW-level error
        # instead of an AttributeError on the next lookup.
        raise CswError(
            "Bad getcapabilities response: operation '{}' not found".format(operation))

    schemas = op.find("{{{}}}Parameter[@name='outputSchema']".format(ows_ns))
    if schemas is None:
        raise CswError(
            "Bad getcapabilities response: no outputSchema parameter "
            "for operation '{}'".format(operation))

    # Materialise the advertised URIs into a set: on Python 3 map() is a
    # one-shot iterator, so repeated membership tests against it would
    # silently drop every schema after the first match.
    advertised = set()
    for value in schemas.findall("{{{}}}Value".format(ows_ns)):
        text = value.text
        advertised.add(text.strip() if text else text)

    return {
        prefix: uri
        for prefix, uri in schemas.nsmap.items()
        if uri in advertised
    }

def getrecords(self, qtype=None, keywords=[],
typenames="csw:Record", esn="brief",
skip=0, count=10, outputschema="gmd", **kw):
from owslib.csw import namespaces

constraints = []
csw = self._ows(**kw)

# fetch target csw server capabilities for requested output schema
output_schemas=self._get_output_schemas('GetRecords')
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we move this call to the __init__() method to avoid duplication and multiple calls to GetCapabilities?
Something like:

def __init__(self, endpoint=None):
    _cap = self.getcapabilities(endpoint)['response']
    self.capabilities = etree.ElementTree(etree.fromstring(_cap))
    self.output_schemas = {
        'GetRecords': self._get_output_schemas('GetRecords'),
        'GetRecordById': self._get_output_schemas('GetRecordById'),
    }

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

done

if not output_schemas.get(outputschema):
raise CswError('Output schema \'{}\' not supported by target server: '.format(output_schemas))

if qtype is not None:
constraints.append(PropertyIsEqualTo("dc:type", qtype))

Expand All @@ -87,7 +109,7 @@ def getrecords(self, qtype=None, keywords=[],
"esn": esn,
"startposition": skip,
"maxrecords": count,
"outputschema": namespaces[outputschema],
"outputschema": output_schemas[outputschema],
"sortby": self.sortby
}
log.info('Making CSW request: getrecords2 %r', kwa)
Expand All @@ -102,10 +124,15 @@ def getrecords(self, qtype=None, keywords=[],
def getidentifiers(self, qtype=None, typenames="csw:Record", esn="brief",
keywords=[], limit=None, page=10, outputschema="gmd",
startposition=0, cql=None, **kw):
from owslib.csw import namespaces

constraints = []
csw = self._ows(**kw)

# fetch target csw server capabilities for requested output schema
output_schemas=self._get_output_schemas('GetRecords')
if not output_schemas.get(outputschema):
raise CswError('Output schema \'{}\' not supported by target server: '.format(output_schemas))

if qtype is not None:
constraints.append(PropertyIsEqualTo("dc:type", qtype))

Expand All @@ -115,7 +142,7 @@ def getidentifiers(self, qtype=None, typenames="csw:Record", esn="brief",
"esn": esn,
"startposition": startposition,
"maxrecords": page,
"outputschema": namespaces[outputschema],
"outputschema": output_schemas[outputschema],
"cql": cql,
"sortby": self.sortby
}
Expand All @@ -129,7 +156,6 @@ def getidentifiers(self, qtype=None, typenames="csw:Record", esn="brief",
err = 'Error getting identifiers: %r' % \
csw.exceptionreport.exceptions
#log.error(err)
raise CswError(err)

if matches == 0:
matches = csw.results['matches']
Expand All @@ -154,11 +180,17 @@ def getidentifiers(self, qtype=None, typenames="csw:Record", esn="brief",
kwa["startposition"] = startposition

def getrecordbyid(self, ids=[], esn="full", outputschema="gmd", **kw):
from owslib.csw import namespaces

csw = self._ows(**kw)

# fetch target csw server capabilities for requested output schema
output_schemas=self._get_output_schemas('GetRecordById')
if not output_schemas.get(outputschema):
raise CswError('Output schema \'{}\' not supported by target server: '.format(output_schemas))
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I should probably be more tolerant here: log an ERROR and return instead of raising.


kwa = {
"esn": esn,
"outputschema": namespaces[outputschema],
"outputschema": output_schemas[outputschema],
}
# Ordinary Python version's don't support the metadata argument
log.info('Making CSW request: getrecordbyid %r %r', ids, kwa)
Expand All @@ -168,14 +200,17 @@ def getrecordbyid(self, ids=[], esn="full", outputschema="gmd", **kw):
csw.exceptionreport.exceptions
#log.error(err)
raise CswError(err)
if not csw.records:
elif csw.records:
record = self._xmd(list(csw.records.values())[0])
elif csw.response:
record = self._xmd(etree.fromstring(csw.response))
else:
return
record = self._xmd(list(csw.records.values())[0])

## strip off the enclosing results container, we only want the metadata
#md = csw._exml.find("/gmd:MD_Metadata")#, namespaces=namespaces)
# Ordinary Python version's don't support the metadata argument
md = csw._exml.find("/{http://www.isotc211.org/2005/gmd}MD_Metadata")
# '/{schema}*' expression should be safe enough and is able to match the
# desired schema followed by both MD_Metadata or MI_Metadata (iso19115[-2])
md = csw._exml.find("/{{{schema}}}*".format(schema=output_schemas[outputschema]))
mdtree = etree.ElementTree(md)
try:
record["xml"] = etree.tostring(mdtree, pretty_print=True, encoding=str)
Expand Down