Skip to content

Commit

Permalink
Merge pull request #569 from RockefellerArchiveCenter/development
Browse files Browse the repository at this point in the history
Updates validation implementation
  • Loading branch information
helrond authored Jun 27, 2023
2 parents 1e406b1 + 545222d commit 20684ef
Show file tree
Hide file tree
Showing 10 changed files with 89 additions and 20 deletions.
3 changes: 3 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
[submodule "rac_schemas"]
path = rac_schemas
url = https://github.com/RockefellerArchiveCenter/rac_schemas
3 changes: 2 additions & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
dist: bionic
dist: focal
language: python
python: "3.10"
cache:
Expand All @@ -15,6 +15,7 @@ env:
before_install:
- cp ${APPLICATION_NAME}/config.py.example ${APPLICATION_NAME}/config.py
- echo "${DOCKER_PASSWORD}" | docker login -u "${DOCKER_USERNAME}" --password-stdin
- git submodule init && git submodule update
- docker-compose up -d
install:
- pip install pre-commit && pre-commit install
Expand Down
6 changes: 6 additions & 0 deletions pisces/config.py.deploy
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,12 @@ MOVING_IMAGE_REFS = ${MOVING_IMAGE_REFS}
AUDIO_REFS = ${AUDIO_REFS}
PHOTOGRAPH_REFS = ${PHOTOGRAPH_REFS}
ASSET_BASEURL = "${ASSET_BASEURL}"
SCHEMAS_BASE_DIR = "${SCHEMAS_BASE_DIR}"
BASE_SCHEMA = "${BASE_SCHEMA}"
AGENT_SCHEMA = "${AGENT_SCHEMA}"
COLLECTION_SCHEMA = "${COLLECTION_SCHEMA}"
OBJECT_SCHEMA = "${OBJECT_SCHEMA}"
TERM_SCHEMA = "${TERM_SCHEMA}"
NOTIFY_EMAIL = ${NOTIFY_EMAIL}
NOTIFY_TEAMS = ${NOTIFY_TEAMS}
TEAMS_URL = "${TEAMS_URL}"
6 changes: 6 additions & 0 deletions pisces/config.py.example
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,12 @@ MOVING_IMAGE_REFS = [] # ArchivesSpace URIs (for example "/subjects/42") for co
AUDIO_REFS = ["/subjects/42"] # ArchivesSpace URIs (for example "/subjects/42") for controlled terms which refer to audio materials (list of strings)
PHOTOGRAPH_REFS = [] # ArchivesSpace URIs (for example "/subjects/42") for controlled terms which refer to photographic materials (list of strings)
ASSET_BASEURL = "https://iiif.rockarch.org" # base URL for IIIF image assets, used to check whether or not assets are available online (string)
SCHEMAS_BASE_DIR = "rac_schemas/schemas" # The base path at which schemas are stored. If this path is relative, it is resolved against the application's base directory (string)
BASE_SCHEMA = "base.json" # Filename for optional base schema into which other schemas for transformed objects will be resolved (string)
AGENT_SCHEMA = "agent.json" # Filename for schema against which transformed agents are validated (string)
COLLECTION_SCHEMA = "collection.json" # Filename for schema against which transformed collections are validated (string)
OBJECT_SCHEMA = "object.json" # Filename for schema against which transformed objects are validated (string)
TERM_SCHEMA = "term.json" # Filename for schema against which transformed terms are validated (string)
NOTIFY_EMAIL = True # deliver error messages via email (boolean)
NOTIFY_TEAMS = False # deliver error message via Microsoft Teams (boolean)
TEAMS_URL = "https://teams-url.com" # URL for Incoming Webhook Connector in Microsoft Teams Channel
9 changes: 9 additions & 0 deletions pisces/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,15 @@
DJANGO_CRON_LOCK_BACKEND = "django_cron.backends.lock.file.FileLock"
DJANGO_CRON_LOCKFILE_PATH = config.DJANGO_CRON_LOCKFILE_PATH

# Locations of the JSON schemas used to validate transformed data.
# "base_dir" is joined onto BASE_DIR; os.path.join discards BASE_DIR when
# config.SCHEMAS_BASE_DIR is already absolute, so absolute paths pass through.
SCHEMAS = {
    "base_dir": os.path.join(BASE_DIR, config.SCHEMAS_BASE_DIR),
    # The base schema is optional: tolerate a config module that omits it
    # instead of failing with AttributeError when settings are imported.
    "base": getattr(config, "BASE_SCHEMA", None),
    "agent": config.AGENT_SCHEMA,
    "collection": config.COLLECTION_SCHEMA,
    "object": config.OBJECT_SCHEMA,
    "term": config.TERM_SCHEMA,
}

ARCHIVESSPACE = {
"baseurl": config.AS_BASEURL,
"username": config.AS_USERNAME,
Expand Down
1 change: 1 addition & 0 deletions rac_schemas
Submodule rac_schemas added at da4f3e
2 changes: 1 addition & 1 deletion requirements.in
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ jsonschema~=4.7
odin~=1.7
psycopg2~=2.9
PyYAML~=6.0
rac-schemas~=0.30
rac-schema-validator~=0.1
requests~=2.28
shortuuid~=1.0
uritemplate~=4.1
Expand Down
14 changes: 7 additions & 7 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ certifi==2023.5.7
# via requests
charset-normalizer==3.1.0
# via requests
django==4.2.1
django==4.2.2
# via
# -r requirements.in
# asterism
Expand All @@ -48,7 +48,7 @@ iso-639==0.4.5
jsonschema==4.17.3
# via
# -r requirements.in
# rac-schemas
# rac-schema-validator
more-itertools==9.1.0
# via archivessnake
multidict==6.0.4
Expand All @@ -66,9 +66,9 @@ pyyaml==6.0
# -r requirements.in
# archivessnake
# vcrpy
rac-schemas==0.30
rac-schema-validator==0.1
# via -r requirements.in
rapidfuzz==3.0.0
rapidfuzz==3.1.1
# via archivessnake
requests==2.31.0
# via
Expand All @@ -86,13 +86,13 @@ sqlparse==0.4.4
# via django
structlog==23.1.0
# via archivessnake
typing-extensions==4.6.2
typing-extensions==4.6.3
# via asgiref
uritemplate==4.1.1
# via -r requirements.in
urllib3==2.0.2
urllib3==2.0.3
# via requests
vcrpy==4.3.1
vcrpy==4.4.0
# via -r requirements.in
wrapt==1.15.0
# via vcrpy
Expand Down
32 changes: 31 additions & 1 deletion transformer/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,13 @@
import random
from unittest.mock import patch

from django.test import TestCase
from django.conf import settings
from django.test import TestCase, override_settings
from django.urls import reverse
from rest_framework.test import APIRequestFactory

from fetcher.helpers import identifier_from_uri
from pisces import config

from .cron import CheckMissingOnlineAssets
from .mappings import has_online_instance, strip_tags
Expand Down Expand Up @@ -260,3 +262,31 @@ def test_ping(self):
def test_strip_tags(self):
    """Checks that strip_tags returns 'a collection' for tagged and plain input."""
    samples = [
        "<title>a collection</title>",
        "a <a href='https://example.com'>collection</a>",
        "a collection",
    ]
    # `markup` rather than `input` so the builtin is not shadowed; subTest
    # reports every failing sample instead of stopping at the first one.
    for markup in samples:
        with self.subTest(markup=markup):
            self.assertEqual('a collection', strip_tags(markup))

@patch('transformer.transformers.is_valid')
def test_validate_transformed(self, mock_is_valid):
    """Checks the arguments validate_transformed passes to is_valid.

    With a base schema configured, is_valid should receive the data, the
    object schema and the base schema, all as dicts. Without one, the third
    argument should be None.
    """
    schemas_with_base = {
        "base_dir": os.path.join(settings.BASE_DIR, config.SCHEMAS_BASE_DIR),
        "base": config.BASE_SCHEMA,
        "agent": config.AGENT_SCHEMA,
        "collection": config.COLLECTION_SCHEMA,
        "object": config.OBJECT_SCHEMA,
        "term": config.TERM_SCHEMA,
    }
    # Same configuration minus the optional base schema entry.
    schemas_without_base = {
        key: value for key, value in schemas_with_base.items() if key != "base"}

    with override_settings(SCHEMAS=schemas_with_base):
        # Use the configured filename so the test follows the same config
        # values as the settings it overrides.
        Transformer().validate_transformed({}, config.OBJECT_SCHEMA)
    mock_is_valid.assert_called_once()
    data, object_schema, base_schema = mock_is_valid.call_args[0]
    self.assertIsInstance(data, dict)
    self.assertIsInstance(object_schema, dict)
    self.assertIsInstance(base_schema, dict)

    mock_is_valid.reset_mock()
    with override_settings(SCHEMAS=schemas_without_base):
        Transformer().validate_transformed({}, config.OBJECT_SCHEMA)
    mock_is_valid.assert_called_once()
    data, object_schema, base_schema = mock_is_valid.call_args[0]
    self.assertIsInstance(data, dict)
    self.assertIsInstance(object_schema, dict)
    self.assertIsNone(base_schema)
33 changes: 23 additions & 10 deletions transformer/transformers.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
import json
from os.path import join

from django.conf import settings
from jsonschema.exceptions import ValidationError
from odin.codecs import json_codec
from rac_schemas import is_valid
from rac_schema_validator import is_valid

from .mappings import (SourceAgentCorporateEntityToAgent,
SourceAgentFamilyToAgent, SourceAgentPersonToAgent,
Expand Down Expand Up @@ -34,11 +36,11 @@ class Transformer:
def run(self, object_type, data):
try:
self.identifier = data.get("uri")
from_resource, mapping, schema = self.get_mapping_classes(object_type)
from_resource, mapping, schema_name = self.get_mapping_classes(object_type)
transformed = self.get_transformed_object(data, from_resource, mapping)
online_pending = self.get_online_pending(
data.get("instances", []), transformed.get("online", False))
is_valid(transformed, schema)
self.validate_transformed(transformed, schema_name)
self.save_validated(transformed, online_pending)
return transformed
except ValidationError as e:
Expand All @@ -48,13 +50,13 @@ def run(self, object_type, data):

def get_mapping_classes(self, object_type):
TYPE_MAP = {
"agent_person": (SourceAgentPerson, SourceAgentPersonToAgent, "agent.json"),
"agent_corporate_entity": (SourceAgentCorporateEntity, SourceAgentCorporateEntityToAgent, "agent.json"),
"agent_family": (SourceAgentFamily, SourceAgentFamilyToAgent, "agent.json"),
"resource": (SourceResource, SourceResourceToCollection, "collection.json"),
"archival_object": (SourceArchivalObject, SourceArchivalObjectToObject, "object.json"),
"archival_object_collection": (SourceArchivalObject, SourceArchivalObjectToCollection, "collection.json"),
"subject": (SourceSubject, SourceSubjectToTerm, "term.json")
"agent_person": (SourceAgentPerson, SourceAgentPersonToAgent, settings.SCHEMAS["agent"]),
"agent_corporate_entity": (SourceAgentCorporateEntity, SourceAgentCorporateEntityToAgent, settings.SCHEMAS["agent"]),
"agent_family": (SourceAgentFamily, SourceAgentFamilyToAgent, settings.SCHEMAS["agent"]),
"resource": (SourceResource, SourceResourceToCollection, settings.SCHEMAS["collection"]),
"archival_object": (SourceArchivalObject, SourceArchivalObjectToObject, settings.SCHEMAS["object"]),
"archival_object_collection": (SourceArchivalObject, SourceArchivalObjectToCollection, settings.SCHEMAS["collection"]),
"subject": (SourceSubject, SourceSubjectToTerm, settings.SCHEMAS["term"])
}
return TYPE_MAP[object_type]

Expand Down Expand Up @@ -93,6 +95,17 @@ def remove_keys_from_dict(self, data, target_key="$"):
return data
return modified_dict

def validate_transformed(self, data, schema_name):
    """Validates an object against the specified schema.

    Args:
        data: the transformed object to validate.
        schema_name (str): filename of the schema to validate against,
            resolved relative to settings.SCHEMAS["base_dir"].

    The base schema named by settings.SCHEMAS["base"] is loaded and passed
    to the validator when that setting is present and non-empty; otherwise
    None is passed in its place. Nothing is returned; any validation
    failure is presumably signalled by an exception from is_valid
    (NOTE(review): confirm against rac_schema_validator).
    """
    schemas_dir = settings.SCHEMAS['base_dir']
    base_schema = None
    if settings.SCHEMAS.get("base"):
        # Context manager so the handle is closed even if the JSON is malformed.
        with open(join(schemas_dir, settings.SCHEMAS['base']), 'r') as base_file:
            base_schema = json.load(base_file)
    with open(join(schemas_dir, schema_name), 'r') as object_file:
        object_schema = json.load(object_file)
    is_valid(data, object_schema, base_schema)

def save_validated(self, data, online_pending):
es_id = data["uri"].split("/")[-1]
try:
Expand Down

0 comments on commit 20684ef

Please sign in to comment.