From 8d2a9cb1461984ea3ef2b8846efcba0a856994a2 Mon Sep 17 00:00:00 2001 From: Hillel Arnold Date: Sat, 27 Aug 2022 17:24:05 -0400 Subject: [PATCH 1/6] add rac_schemas as git module --- .gitmodules | 3 +++ .travis.yml | 1 + rac_schemas | 1 + 3 files changed, 5 insertions(+) create mode 100644 .gitmodules create mode 160000 rac_schemas diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 00000000..0f656fa4 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "rac_schemas"] + path = rac_schemas + url = https://github.com/RockefellerArchiveCenter/rac_schemas.git diff --git a/.travis.yml b/.travis.yml index 438f2979..7e4e7028 100644 --- a/.travis.yml +++ b/.travis.yml @@ -15,6 +15,7 @@ env: before_install: - cp ${APPLICATION_NAME}/config.py.example ${APPLICATION_NAME}/config.py - echo "${DOCKER_PASSWORD}" | docker login -u "${DOCKER_USERNAME}" --password-stdin + - git submodule init && git submodule update - docker-compose up -d install: - pip install pre-commit && pre-commit install diff --git a/rac_schemas b/rac_schemas new file mode 160000 index 00000000..78fcfd0f --- /dev/null +++ b/rac_schemas @@ -0,0 +1 @@ +Subproject commit 78fcfd0f5faa4ee5a9457fdd411102783b2f6947 From 195f418c115f97f9bb08c850f005fb07e2805740 Mon Sep 17 00:00:00 2001 From: Hillel Arnold Date: Sat, 27 Aug 2022 17:25:08 -0400 Subject: [PATCH 2/6] update requirements --- requirements.in | 4 ++-- requirements.txt | 22 +++++++++++----------- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/requirements.in b/requirements.in index 4c654d3a..b8cca41e 100644 --- a/requirements.in +++ b/requirements.in @@ -1,6 +1,6 @@ ArchivesSnake~=0.9 asterism~=0.9 -Django~=4.0 +Django~=4.0.7 django4-cron~=0.5 djangorestframework~=3.13 ElectronBonder~=1.1 @@ -9,7 +9,7 @@ jsonschema~=4.7 odin~=1.7 psycopg2-binary~=2.9 PyYAML~=6.0 -rac-schemas~=0.30 +rac-schema-validator~=0.1 requests~=2.28 shortuuid~=1.0 uritemplate~=4.1 diff --git a/requirements.txt b/requirements.txt index 
3362e39e..d92819bc 100644 --- a/requirements.txt +++ b/requirements.txt @@ -20,9 +20,9 @@ boltons==21.0.0 # via archivessnake certifi==2022.6.15 # via requests -charset-normalizer==2.1.0 +charset-normalizer==2.1.1 # via requests -django==4.0.6 +django==4.0.7 # via # -r requirements.in # asterism @@ -45,13 +45,13 @@ idna==3.3 # yarl iso-639==0.4.5 # via -r requirements.in -jarowinkler==1.2.0 +jarowinkler==1.2.1 # via rapidfuzz -jsonschema==4.9.0 +jsonschema==4.14.0 # via # -r requirements.in - # rac-schemas -more-itertools==8.13.0 + # rac-schema-validator +more-itertools==8.14.0 # via archivessnake multidict==6.0.2 # via yarl @@ -65,16 +65,16 @@ psycopg2-binary==2.9.3 # asterism pyrsistent==0.18.1 # via jsonschema -pytz==2022.1 +pytz==2022.2.1 # via djangorestframework pyyaml==6.0 # via # -r requirements.in # archivessnake # vcrpy -rac-schemas==0.30 +rac-schema-validator==0.1 # via -r requirements.in -rapidfuzz==2.4.2 +rapidfuzz==2.6.0 # via archivessnake requests==2.28.1 # via @@ -94,11 +94,11 @@ structlog==22.1.0 # via archivessnake uritemplate==4.1.1 # via -r requirements.in -urllib3==1.26.11 +urllib3==1.26.12 # via requests vcrpy==4.2.0 # via -r requirements.in wrapt==1.14.1 # via vcrpy -yarl==1.7.2 +yarl==1.8.1 # via vcrpy From 924d76f40c80f0fdd2e62dc5ce1d99cf675d1a19 Mon Sep 17 00:00:00 2001 From: Hillel Arnold Date: Sat, 27 Aug 2022 17:26:03 -0400 Subject: [PATCH 3/6] update configs for schema location and files --- pisces/config.py.deploy | 6 ++++++ pisces/config.py.example | 6 ++++++ pisces/settings.py | 9 +++++++++ 3 files changed, 21 insertions(+) diff --git a/pisces/config.py.deploy b/pisces/config.py.deploy index a788b40f..c65b56e9 100644 --- a/pisces/config.py.deploy +++ b/pisces/config.py.deploy @@ -30,3 +30,9 @@ MOVING_IMAGE_REFS = ${MOVING_IMAGE_REFS} AUDIO_REFS = ${AUDIO_REFS} PHOTOGRAPH_REFS = ${PHOTOGRAPH_REFS} ASSET_BASEURL = "${ASSET_BASEURL}" +SCHEMAS_BASE_DIR = "${SCHEMAS_BASE_DIR}" +BASE_SCHEMA = "${BASE_SCHEMA}" +AGENT_SCHEMA = 
"${AGENT_SCHEMA}" +COLLECTION_SCHEMA = "${COLLECTION_SCHEMA}" +OBJECT_SCHEMA = "${OBJECT_SCHEMA}" +TERM_SCHEMA = "${TERM_SCHEMA}" diff --git a/pisces/config.py.example b/pisces/config.py.example index 4446d3c8..c6955328 100644 --- a/pisces/config.py.example +++ b/pisces/config.py.example @@ -30,3 +30,9 @@ MOVING_IMAGE_REFS = [] # ArchivesSpace URIs (for example "/subjects/42") for co AUDIO_REFS = ["/subjects/42"] # ArchivesSpace URIs (for example "/subjects/42") for controlled terms which refer to audio materials (list of strings) PHOTOGRAPH_REFS = [] # ArchivesSpace URIs (for example "/subjects/42") for controlled terms which refer to photographic materials (list of strings) ASSET_BASEURL = "https://iiif.rockarch.org" # base URL for IIIF image assets, used to check whether or not assets are available online (string) +SCHEMAS_BASE_DIR = "rac_schemas/schemas" # The base path at which schemas are stored. If this path is relative it will be prefixed with the application's base directory (string) +BASE_SCHEMA = "base.json" # Filename for optional base schema into which other schemas for transformed objects will be resolved (string) +AGENT_SCHEMA = "agent.json" # Filename for schema against which transformed agents are validated (string) +COLLECTION_SCHEMA = "collection.json" # Filename for schema against which transformed collections are validated (string) +OBJECT_SCHEMA = "object.json" # Filename for schema against which transformed objects are validated (string) +TERM_SCHEMA = "term.json" # Filename for schema against which transformed terms are validated (string) diff --git a/pisces/settings.py b/pisces/settings.py index 2e93a039..3c733802 100644 --- a/pisces/settings.py +++ b/pisces/settings.py @@ -153,6 +153,15 @@ DJANGO_CRON_LOCK_BACKEND = "django_cron.backends.lock.file.FileLock" DJANGO_CRON_LOCKFILE_PATH = config.DJANGO_CRON_LOCKFILE_PATH +SCHEMAS = { + "base_dir": os.path.join(BASE_DIR, config.SCHEMAS_BASE_DIR), + "base": config.BASE_SCHEMA, + "agent": 
config.AGENT_SCHEMA, + "collection": config.COLLECTION_SCHEMA, + "object": config.OBJECT_SCHEMA, + "term": config.TERM_SCHEMA, +} + ARCHIVESSPACE = { "baseurl": config.AS_BASEURL, "username": config.AS_USERNAME, From 9cbfe37c1d6812b5bd3366d8e9e32b70cb2beff8 Mon Sep 17 00:00:00 2001 From: Hillel Arnold Date: Sat, 27 Aug 2022 17:27:01 -0400 Subject: [PATCH 4/6] load schemas from configurable location, use rac-schema-validator --- transformer/tests.py | 32 +++++++++++++++++++++++++++++++- transformer/transformers.py | 33 +++++++++++++++++++++++---------- 2 files changed, 54 insertions(+), 11 deletions(-) diff --git a/transformer/tests.py b/transformer/tests.py index c4054500..851e3c5d 100644 --- a/transformer/tests.py +++ b/transformer/tests.py @@ -3,11 +3,13 @@ import random from unittest.mock import patch -from django.test import TestCase +from django.conf import settings +from django.test import TestCase, override_settings from django.urls import reverse from rest_framework.test import APIRequestFactory from fetcher.helpers import identifier_from_uri +from pisces import config from .cron import CheckMissingOnlineAssets from .mappings import has_online_instance, strip_tags @@ -259,3 +261,31 @@ def test_ping(self): def test_strip_tags(self): for input in ["a collection", "a collection", "a collection"]: self.assertEqual('a collection', strip_tags(input)) + + @patch('transformer.transformers.is_valid') + def test_validate_transformed(self, mock_is_valid): + with override_settings(SCHEMAS={ + "base_dir": os.path.join(settings.BASE_DIR, config.SCHEMAS_BASE_DIR), + "base": config.BASE_SCHEMA, + "agent": config.AGENT_SCHEMA, + "collection": config.COLLECTION_SCHEMA, + "object": config.OBJECT_SCHEMA, + "term": config.TERM_SCHEMA, }): + Transformer().validate_transformed({}, "object.json") + arg_values = mock_is_valid.call_args[0] + self.assertTrue(isinstance(arg_values[0], dict)) + self.assertTrue(isinstance(arg_values[1], dict)) + self.assertTrue(isinstance(arg_values[2], 
dict)) + + mock_is_valid.reset_mock() + with override_settings(SCHEMAS={ + "base_dir": os.path.join(settings.BASE_DIR, config.SCHEMAS_BASE_DIR), + "agent": config.AGENT_SCHEMA, + "collection": config.COLLECTION_SCHEMA, + "object": config.OBJECT_SCHEMA, + "term": config.TERM_SCHEMA, }): + Transformer().validate_transformed({}, "object.json") + arg_values = mock_is_valid.call_args[0] + self.assertTrue(isinstance(arg_values[0], dict)) + self.assertTrue(isinstance(arg_values[1], dict)) + self.assertEqual(arg_values[2], None) diff --git a/transformer/transformers.py b/transformer/transformers.py index c207cd76..f1b62180 100644 --- a/transformer/transformers.py +++ b/transformer/transformers.py @@ -1,8 +1,10 @@ import json +from os.path import join +from django.conf import settings from jsonschema.exceptions import ValidationError from odin.codecs import json_codec -from rac_schemas import is_valid +from rac_schema_validator import is_valid from .mappings import (SourceAgentCorporateEntityToAgent, SourceAgentFamilyToAgent, SourceAgentPersonToAgent, @@ -34,11 +36,11 @@ class Transformer: def run(self, object_type, data): try: self.identifier = data.get("uri") - from_resource, mapping, schema = self.get_mapping_classes(object_type) + from_resource, mapping, schema_name = self.get_mapping_classes(object_type) transformed = self.get_transformed_object(data, from_resource, mapping) online_pending = self.get_online_pending( data.get("instances", []), transformed.get("online", False)) - is_valid(transformed, schema) + self.validate_transformed(transformed, schema_name) self.save_validated(transformed, online_pending) return transformed except ValidationError as e: @@ -48,13 +50,13 @@ def run(self, object_type, data): def get_mapping_classes(self, object_type): TYPE_MAP = { - "agent_person": (SourceAgentPerson, SourceAgentPersonToAgent, "agent.json"), - "agent_corporate_entity": (SourceAgentCorporateEntity, SourceAgentCorporateEntityToAgent, "agent.json"), - "agent_family": 
(SourceAgentFamily, SourceAgentFamilyToAgent, "agent.json"), - "resource": (SourceResource, SourceResourceToCollection, "collection.json"), - "archival_object": (SourceArchivalObject, SourceArchivalObjectToObject, "object.json"), - "archival_object_collection": (SourceArchivalObject, SourceArchivalObjectToCollection, "collection.json"), - "subject": (SourceSubject, SourceSubjectToTerm, "term.json") + "agent_person": (SourceAgentPerson, SourceAgentPersonToAgent, settings.SCHEMAS["agent"]), + "agent_corporate_entity": (SourceAgentCorporateEntity, SourceAgentCorporateEntityToAgent, settings.SCHEMAS["agent"]), + "agent_family": (SourceAgentFamily, SourceAgentFamilyToAgent, settings.SCHEMAS["agent"]), + "resource": (SourceResource, SourceResourceToCollection, settings.SCHEMAS["collection"]), + "archival_object": (SourceArchivalObject, SourceArchivalObjectToObject, settings.SCHEMAS["object"]), + "archival_object_collection": (SourceArchivalObject, SourceArchivalObjectToCollection, settings.SCHEMAS["collection"]), + "subject": (SourceSubject, SourceSubjectToTerm, settings.SCHEMAS["term"]) } return TYPE_MAP[object_type] @@ -92,6 +94,17 @@ def remove_keys_from_dict(self, data, target_key="$"): return data return modified_dict + def validate_transformed(self, data, schema_name): + """Validates an object against the specified schema.""" + base_schema = None + if settings.SCHEMAS.get("base"): + base_file = open(join(settings.SCHEMAS['base_dir'], settings.SCHEMAS['base']), 'r') + base_schema = json.load(base_file) + base_file.close() + with open(join(settings.SCHEMAS['base_dir'], schema_name), 'r') as object_file: + object_schema = json.load(object_file) + is_valid(data, object_schema, base_schema) + def save_validated(self, data, online_pending): es_id = data["uri"].split("/")[-1] try: From 50c5ffe36357c4bf6096990a0e1ffdf7ff69ad03 Mon Sep 17 00:00:00 2001 From: Hillel Arnold Date: Tue, 27 Jun 2023 13:39:11 -0400 Subject: [PATCH 5/6] add rac_schema_validator and rac_schemas as
submodule --- .gitmodules | 3 +++ .travis.yml | 3 ++- rac_schemas | 1 + requirements.in | 2 +- requirements.txt | 14 +++++++------- 5 files changed, 14 insertions(+), 9 deletions(-) create mode 100644 .gitmodules create mode 160000 rac_schemas diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 00000000..8c044f3d --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "rac_schemas"] + path = rac_schemas + url = https://github.com/RockefellerArchiveCenter/rac_schemas diff --git a/.travis.yml b/.travis.yml index 438f2979..7e97583e 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,4 +1,4 @@ -dist: bionic +dist: focal language: python python: "3.10" cache: @@ -15,6 +15,7 @@ env: before_install: - cp ${APPLICATION_NAME}/config.py.example ${APPLICATION_NAME}/config.py - echo "${DOCKER_PASSWORD}" | docker login -u "${DOCKER_USERNAME}" --password-stdin + - git submodule init && git submodule update - docker-compose up -d install: - pip install pre-commit && pre-commit install diff --git a/rac_schemas b/rac_schemas new file mode 160000 index 00000000..da4f3e85 --- /dev/null +++ b/rac_schemas @@ -0,0 +1 @@ +Subproject commit da4f3e85ebbdb33de422c615030f8e3417035a14 diff --git a/requirements.in b/requirements.in index 7919f28c..794c8f9d 100644 --- a/requirements.in +++ b/requirements.in @@ -9,7 +9,7 @@ jsonschema~=4.7 odin~=1.7 psycopg2~=2.9 PyYAML~=6.0 -rac-schemas~=0.30 +rac-schema-validator~=0.1 requests~=2.28 shortuuid~=1.0 uritemplate~=4.1 diff --git a/requirements.txt b/requirements.txt index a5cb85fb..97d3e4f3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -22,7 +22,7 @@ certifi==2023.5.7 # via requests charset-normalizer==3.1.0 # via requests -django==4.2.1 +django==4.2.2 # via # -r requirements.in # asterism @@ -48,7 +48,7 @@ iso-639==0.4.5 jsonschema==4.17.3 # via # -r requirements.in - # rac-schemas + # rac-schema-validator more-itertools==9.1.0 # via archivessnake multidict==6.0.4 @@ -66,9 +66,9 @@ pyyaml==6.0 # -r requirements.in # 
archivessnake # vcrpy -rac-schemas==0.30 +rac-schema-validator==0.1 # via -r requirements.in -rapidfuzz==3.0.0 +rapidfuzz==3.1.1 # via archivessnake requests==2.31.0 # via @@ -86,13 +86,13 @@ sqlparse==0.4.4 # via django structlog==23.1.0 # via archivessnake -typing-extensions==4.6.2 +typing-extensions==4.6.3 # via asgiref uritemplate==4.1.1 # via -r requirements.in -urllib3==2.0.2 +urllib3==2.0.3 # via requests -vcrpy==4.3.1 +vcrpy==4.4.0 # via -r requirements.in wrapt==1.15.0 # via vcrpy From 622401b93d1a0f348df0e10266f17389f8b1eb1e Mon Sep 17 00:00:00 2001 From: Hillel Arnold Date: Tue, 27 Jun 2023 13:40:24 -0400 Subject: [PATCH 6/6] use rac_schema_validator --- transformer/transformers.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/transformer/transformers.py b/transformer/transformers.py index dc968fe5..26ac73c2 100644 --- a/transformer/transformers.py +++ b/transformer/transformers.py @@ -1,8 +1,9 @@ import json +from os.path import join +import rac_schema_validator from jsonschema.exceptions import ValidationError from odin.codecs import json_codec -from rac_schemas import is_valid from .mappings import (SourceAgentCorporateEntityToAgent, SourceAgentFamilyToAgent, SourceAgentPersonToAgent, @@ -38,7 +39,7 @@ def run(self, object_type, data): transformed = self.get_transformed_object(data, from_resource, mapping) online_pending = self.get_online_pending( data.get("instances", []), transformed.get("online", False)) - is_valid(transformed, schema) + self.validate_transformed(transformed, schema) self.save_validated(transformed, online_pending) return transformed except ValidationError as e: @@ -93,6 +94,15 @@ def remove_keys_from_dict(self, data, target_key="$"): return data return modified_dict + def validate_transformed(self, data, schema_name): + """Validates transformed data against RAC schemas.""" + base_file = open(join('rac_schemas', 'schemas', 'base.json'), 'r') + base_schema = json.load(base_file) + 
base_file.close() + with open(join('rac_schemas', 'schemas', schema_name), 'r') as object_file: + object_schema = json.load(object_file) + return rac_schema_validator.is_valid(data, object_schema, base_schema) + def save_validated(self, data, online_pending): es_id = data["uri"].split("/")[-1] try: