From 2fd5fcaadce080b117b69bffaa31001af261c479 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thibault=20Cl=C3=A9rice?= Date: Sat, 24 Aug 2024 18:35:05 +0200 Subject: [PATCH 01/16] [WIP] Database models to cache the metadata and references --- .gitignore | 1 + dapitains/app/database.py | 97 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 98 insertions(+) create mode 100644 dapitains/app/database.py diff --git a/.gitignore b/.gitignore index 7b6caf3..06fab00 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ +app.db # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] diff --git a/dapitains/app/database.py b/dapitains/app/database.py new file mode 100644 index 0000000..2a4a87f --- /dev/null +++ b/dapitains/app/database.py @@ -0,0 +1,97 @@ +try: + from flask_sqlalchemy import SQLAlchemy + from sqlalchemy.ext.mutable import MutableDict + from sqlalchemy.types import TypeDecorator, TEXT + import click +except ImportError: + print("This part of the package can only be imported with the web requirements.") + raise + +import dapitains.metadata.classes as abstracts +import json + +db = SQLAlchemy() + +parent_child_association = db.Table('parent_child_association', + db.Column('parent_id', db.Integer, db.ForeignKey('collections.id'), primary_key=True), + db.Column('child_id', db.Integer, db.ForeignKey('collections.id'), primary_key=True) +) + + +class JSONEncodedDict(TypeDecorator): + """Enables JSON storage by encoding and decoding on the fly.""" + impl = TEXT + + def process_bind_param(self, value, dialect): + if value is None: + return '' + elif isinstance(value, dict): + return json.dumps(value) + return value + + def process_result_value(self, value, dialect): + if value is None: + return '""' + return json.loads(value) + +class Collection(db.Model): + __tablename__ = 'collections' + + id = db.Column(db.Integer, primary_key=True, autoincrement=True) + identifier = db.Column(db.String, nullable=False, unique=True) + title = db.Column(db.String, nullable=False) + description = db.Column(db.String, nullable=True) + resource = db.Column(db.Boolean, default=False) + filepath = db.Column(db.String, nullable=True) + dublin_core = db.Column(MutableDict.as_mutable(JSONEncodedDict), nullable=True) + extensions = db.Column(MutableDict.as_mutable(JSONEncodedDict), nullable=True) + + # One-to-one relationship with Navigation + navigation = db.relationship('Navigation', uselist=False, backref='collection', lazy='noload') + + + parents = db.relationship( + 'Collection', + secondary=parent_child_association, + primaryjoin=id == parent_child_association.c.child_id, + secondaryjoin=id == parent_child_association.c.parent_id, + backref='children' + ) + + @classmethod + def from_class(cls, obj: abstracts.Collection) -> "Collection": + return cls( + identifier=obj.identifier, + title=obj.title, + description=obj.description, + resource=obj.resource, + filepath=obj.filepath, + # We are dumping because it's not read or accessible + dublin_core=json.dumps([dub.json() for dub in obj.dublin_core]), + extensions=json.dumps([ext.json() for ext in obj.extension]) + ) + +class Navigation(db.Model): + __tablename__ = 'navigations' + + id = db.Column(db.Integer, primary_key=True) + collection_id = db.Column(db.Integer, db.ForeignKey('collections.id'), nullable=False, unique=True) + default_tree = db.Column(db.String, nullable=True) + + # JSON fields stored as TEXT + paths = db.Column(MutableDict.as_mutable(JSONEncodedDict), nullable=False, default={}) + references = db.Column(MutableDict.as_mutable(JSONEncodedDict), nullable=False, default={}) + +if __name__ == "__main__": + import flask + import os + app = flask.Flask(__name__) + + basedir = os.path.abspath(os.path.dirname(__file__)) + db_path = os.path.join(basedir, 'app.db') + app.config['SQLALCHEMY_DATABASE_URI'] = f'sqlite:///{db_path}' + app.config['SQLALCHEMY_TRACK_MODIFICATIONS'] = False + + db.init_app(app) + with app.app_context(): + db.create_all() \ No newline at end of file From 2c618eb631f3e29e9526a32f6d96a3e8d4f9d8ce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thibault=20Cl=C3=A9rice?= Date: Sat, 24 Aug 2024 20:34:26 +0200 Subject: [PATCH 02/16] Reference parsing and ingestion --- dapitains/app/database.py | 40 +++++------- dapitains/app/ingest.py | 90 +++++++++++++++++++++++++++ dapitains/tei/citeStructure.py | 4 +- dapitains/tei/{tei.py => document.py} | 0 tests/test_citeStructure.py | 6 +- tests/test_db_create.py | 52 ++++++++++++++++ tests/test_tei.py | 2 +- 7 files changed, 162 insertions(+), 32 deletions(-) create mode 100644 dapitains/app/ingest.py rename dapitains/tei/{tei.py => document.py} (100%) create mode 100644 tests/test_db_create.py diff --git a/dapitains/app/database.py b/dapitains/app/database.py index 2a4a87f..76f7259 100644 --- a/dapitains/app/database.py +++ b/dapitains/app/database.py @@ -1,6 +1,6 @@ try: from flask_sqlalchemy import SQLAlchemy - from sqlalchemy.ext.mutable import MutableDict + from sqlalchemy.ext.mutable import MutableDict, Mutable from sqlalchemy.types import TypeDecorator, TEXT import click except ImportError: @@ -8,8 +8,11 @@ raise import dapitains.metadata.classes as abstracts +from dapitains.metadata.xml_parser import Catalog +from dapitains.tei.document import Document import json + db = SQLAlchemy() parent_child_association = db.Table('parent_child_association', @@ -18,20 +21,19 @@ ) -class JSONEncodedDict(TypeDecorator): +class JSONEncoded(TypeDecorator): """Enables JSON storage by encoding and decoding on the fly.""" impl = TEXT def process_bind_param(self, value, dialect): if value is None: - return '' - elif isinstance(value, dict): + return None + else: return json.dumps(value) - return value def process_result_value(self, value, dialect): if value is None: - return '""' + return None return json.loads(value) class Collection(db.Model): @@ -43,8 +45,8 @@ class Collection(db.Model): description = db.Column(db.String, nullable=True) resource = db.Column(db.Boolean, default=False) filepath = db.Column(db.String, nullable=True) - dublin_core = db.Column(MutableDict.as_mutable(JSONEncodedDict), nullable=True) - extensions = db.Column(MutableDict.as_mutable(JSONEncodedDict), nullable=True) + dublin_core = db.Column(JSONEncoded, nullable=True) + extensions = db.Column(JSONEncoded, nullable=True) # One-to-one relationship with Navigation navigation = db.relationship('Navigation', uselist=False, backref='collection', lazy='noload') @@ -67,8 +69,8 @@ def from_class(cls, obj: abstracts.Collection) -> "Collection": resource=obj.resource, filepath=obj.filepath, # We are dumping because it's not read or accessible - dublin_core=json.dumps([dub.json() for dub in obj.dublin_core]), - extensions=json.dumps([ext.json() for ext in obj.extension]) + dublin_core=[dub.json() for dub in obj.dublin_core], + extensions=[ext.json() for ext in obj.extension] ) class Navigation(db.Model): @@ -79,19 +81,5 @@ class Navigation(db.Model): default_tree = db.Column(db.String, nullable=True) # JSON fields stored as TEXT - paths = db.Column(MutableDict.as_mutable(JSONEncodedDict), nullable=False, default={}) - references = db.Column(MutableDict.as_mutable(JSONEncodedDict), nullable=False, default={}) - -if __name__ == "__main__": - import flask - import os - app = flask.Flask(__name__) - - basedir = os.path.abspath(os.path.dirname(__file__)) - db_path = os.path.join(basedir, 'app.db') - app.config['SQLALCHEMY_DATABASE_URI'] = f'sqlite:///{db_path}' - app.config['SQLALCHEMY_TRACK_MODIFICATIONS'] = False - - db.init_app(app) - with app.app_context(): - db.create_all() \ No newline at end of file + paths = db.Column(JSONEncoded, nullable=False, default={}) + references = db.Column(JSONEncoded, nullable=False, default={}) diff --git a/dapitains/app/ingest.py b/dapitains/app/ingest.py new file mode 100644 index 0000000..f293997 --- /dev/null +++ b/dapitains/app/ingest.py @@ -0,0 +1,90 @@ +from typing import Dict, List, Optional, Any +from dapitains.app.database import Collection, Navigation, db +from dapitains.metadata.xml_parser import Catalog +from dapitains.tei.document import Document + +def store_catalog(catalog: Catalog): + for identifier, collection in catalog.objects.items(): + db.session.add(Collection.from_class(collection)) + if collection.resource: + doc = Document(collection.filepath) + references = { + key: struct.find_refs(root=doc.xml, structure=struct.units) for key, struct in doc.citeStructure.items() + } + + +def get_member_by_path(data: List[Dict[str, Any]], path: List[int]) -> Optional[Dict[str, Any]]: + """ + Retrieve the member at the specified path in the nested data structure. + + :param data: The nested data structure (list of dictionaries). + :param path: A list of indices that represent the path to the desired member. + :return: The member at the specified path, or None if the path is invalid. + """ + current_level = data + + for index in path: + try: + current_level = current_level[index] + if 'members' in current_level: + current_level = current_level['members'] + except (IndexError, KeyError): + return None + + return current_level + + +def generate_paths(data: List[Dict[str, Any]], path: Optional[List[int]] = None) -> Dict[str, List[int]]: + """ + Generate a dictionary mapping each 'ref' in a nested data structure to its path. + + The path is represented as a list of indices that show how to access each 'ref' + in the nested structure. + + :param data: The nested data structure (list of dictionaries). Each dictionary + can have a 'ref' and/or 'members' key. + :param path: A list of indices representing the current path in the nested data + structure. Used internally for recursion. Defaults to None for the + initial call. + :return: A dictionary where each key is a 'ref' and each value is a list of indices + representing the path to that 'ref' in the nested structure. + """ + if path is None: + path = [] + + paths = {} + + def recurse(items, current_path): + for index, item in enumerate(items): + ref = item.get('ref') + if ref: + # Record the path for the current reference + paths[ref] = current_path + [index] + + members = item.get('members') + if members: + # Recurse into the 'members' list + recurse(members, current_path + [index]) + + recurse(data, []) + return paths + +if __name__ == "__main__": + import flask + import os + from dapitains.metadata.xml_parser import ingest_catalog + app = flask.Flask(__name__) + + basedir = os.path.abspath(os.path.dirname(__file__)) + db_path = os.path.join(basedir, 'app.db') + app.config['SQLALCHEMY_DATABASE_URI'] = f'sqlite:///{db_path}' + app.config['SQLALCHEMY_TRACK_MODIFICATIONS'] = False + + db.init_app(app) + with app.app_context(): + db.drop_all() + db.create_all() + + catalog, _ = ingest_catalog("/home/thibault/dev/MyDapytains/tests/catalog/example-collection.xml") + + store_catalog(catalog) \ No newline at end of file diff --git a/dapitains/tei/citeStructure.py b/dapitains/tei/citeStructure.py index 79e866d..5767544 100644 --- a/dapitains/tei/citeStructure.py +++ b/dapitains/tei/citeStructure.py @@ -48,14 +48,14 @@ class CitableUnit: dublinCore: Dict[str, List[str]] = field(default_factory=lambda: defaultdict(list)) extension: Dict[str, List[str]] = field(default_factory=lambda: defaultdict(list)) - def to_dts(self): + def json(self): out = { "citeType": self.citeType, "ref": self.ref } if self.children: out["members"] = [ - member.to_dts() + member.json() for member in self.children ] if self.dublinCore: diff --git a/dapitains/tei/tei.py b/dapitains/tei/document.py similarity index 100% rename from dapitains/tei/tei.py rename to dapitains/tei/document.py diff --git a/tests/test_citeStructure.py b/tests/test_citeStructure.py index 8154fd9..e73edd0 100644 --- a/tests/test_citeStructure.py +++ b/tests/test_citeStructure.py @@ -58,7 +58,7 @@ def test_parsing(): # Generate XPath for "Luke 1" (partial match) assert parser.generate_xpath("Luke") == "//body/div[@n='Luke']" - assert [root.to_dts() for root in parser.find_refs(root=TEI, structure=parser.units)] == [ + assert [root.json() for root in parser.find_refs(root=TEI, structure=parser.units)] == [ {'citeType': 'book', 'ref': 'Luke', 'members': [ {'citeType': 'chapter', 'ref': 'Luke 1', 'members': [ {'citeType': 'verse', 'ref': 'Luke 1:1'}, @@ -82,7 +82,7 @@ def test_cite_data(): citeStructure = xpath.evaluate_single("/TEI/teiHeader/refsDecl[1]") parser = CiteStructureParser(citeStructure) refs = parser.find_refs(root=TEI, structure=parser.units) - refs = [ref.to_dts() for ref in refs] + refs = [ref.json() for ref in refs] assert refs == [ {'citeType': 'book', 'ref': '1', 'dublinCore': { 'http://purl.org/dc/terms/title': ['Introduction', 'Introduction'], @@ -106,7 +106,7 @@ def test_advanced_cite_data(): citeStructure = xpath.evaluate_single("/TEI/teiHeader/refsDecl[1]") parser = CiteStructureParser(citeStructure) refs = parser.find_refs(root=TEI, structure=parser.units) - refs = [ref.to_dts() for ref in refs] + refs = [ref.json() for ref in refs] assert refs == [ {'citeType': 'part', 'ref': 'part-1', 'members': [ {'citeType': 'book', 'ref': 'part-1.1', 'dublinCore': { diff --git a/tests/test_db_create.py b/tests/test_db_create.py new file mode 100644 index 0000000..d5d8604 --- /dev/null +++ b/tests/test_db_create.py @@ -0,0 +1,52 @@ +import flask +from dapitains.app.ingest import generate_paths, get_member_by_path +from dapitains.tei.document import Document +import os + + +local_dir = os.path.join(os.path.dirname(__file__)) + + +def test_simple_path(): + """Check that a document can be parsed and that path are corrects""" + doc = Document(f"{local_dir}/tei/multiple_tree.xml") + refs = { + tree: [ref.json() for ref in obj.find_refs(doc.xml, structure=obj.units)] + for tree, obj in doc.citeStructure.items() + } + paths = {tree: generate_paths(ref) for tree, ref in refs.items()} + assert paths == { + 'nums': { + 'I': [0], '1': [1], 'A': [2], '4': [3], 'V': [4] + }, + None: { + 'I': [0], '1': [1], 'A': [2], '4': [3], 'V': [4] + }, + 'alpha': { + 'div-a1': [0], 'div-002': [1], 'div-xyz': [2], 'div-004': [3], 'div-v5': [4] + } + } + # Second part of the test + doc = Document(f"{local_dir}/tei/base_tei.xml") + refs = { + tree: [ref.json() for ref in obj.find_refs(doc.xml, structure=obj.units)] + for tree, obj in doc.citeStructure.items() + } + paths = {tree: generate_paths(ref) for tree, ref in refs.items()} + assert paths == { + None: { + "Luke": [0], + "Luke 1": [0, 0], + "Luke 1:1": [0, 0, 0], + "Luke 1:2": [0, 0, 1], + "Luke 1#1": [0, 0, 2], + "Mark": [1], + "Mark 1": [1, 0], + "Mark 1:1": [1, 0, 0], + "Mark 1:2": [1, 0, 1], + "Mark 1#1": [1, 0, 2], + "Mark 1:3": [1, 0, 3] + } + } + assert get_member_by_path(refs[None], paths[None]["Mark 1:3"]) == {'citeType': 'verse', 'ref': 'Mark 1:3'} + diff --git a/tests/test_tei.py b/tests/test_tei.py index a9f367e..6107027 100644 --- a/tests/test_tei.py +++ b/tests/test_tei.py @@ -1,7 +1,7 @@ import os.path import pytest -from dapitains.tei.tei import Document +from dapitains.tei.document import Document from lxml.etree import tostring local_dir = os.path.join(os.path.dirname(__file__), "tei") From 08b9be75c26c066905e711f825dff80b6c680708 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thibault=20Cl=C3=A9rice?= Date: Sat, 24 Aug 2024 21:44:18 +0200 Subject: [PATCH 03/16] [WIP] Working on the navigation parser --- dapitains/app/ingest.py | 61 +++++++++++++++++++++++++++++++++++++++-- tests/test_db_create.py | 27 +++++++++++++++++- 2 files changed, 84 insertions(+), 4 deletions(-) diff --git a/dapitains/app/ingest.py b/dapitains/app/ingest.py index f293997..91151b8 100644 --- a/dapitains/app/ingest.py +++ b/dapitains/app/ingest.py @@ -1,7 +1,8 @@ -from typing import Dict, List, Optional, Any +from typing import Dict, List, Optional, Any, Tuple from dapitains.app.database import Collection, Navigation, db from dapitains.metadata.xml_parser import Catalog from dapitains.tei.document import Document +import copy def store_catalog(catalog: Catalog): for identifier, collection in catalog.objects.items(): @@ -23,10 +24,12 @@ def get_member_by_path(data: List[Dict[str, Any]], path: List[int]) -> Optional[ """ current_level = data - for index in path: + path_copy = [] + path + while path_copy: + index = path_copy.pop(0) try: current_level = current_level[index] - if 'members' in current_level: + if 'members' in current_level and path_copy: current_level = current_level['members'] except (IndexError, KeyError): return None @@ -34,6 +37,10 @@ def get_member_by_path(data: List[Dict[str, Any]], path: List[int]) -> Optional[ return current_level +def strip_members(obj: Dict[str, Any]) -> Dict[str, Any]: + return {k: v for k, v in obj.items() if k != "members"} + + def generate_paths(data: List[Dict[str, Any]], path: Optional[List[int]] = None) -> Dict[str, List[int]]: """ Generate a dictionary mapping each 'ref' in a nested data structure to its path. @@ -69,6 +76,54 @@ def recurse(items, current_path): recurse(data, []) return paths + +def get_nav( + refs: List[Dict[str, Any]], + paths: Dict[str, List[int]], + start_or_ref: Optional[str], + end: Optional[str], + down: Optional[int] = 1 +) -> Tuple[List[Dict[str, Any]], Optional[Dict[str, Any]], Optional[Dict[str, Any]]]: + + paths_index = list(paths.keys()) + start_index, end_index = None, None + if start_or_ref: + start_index = paths_index.index(start_or_ref) + if end: + end_index = paths_index.index(end) + 1 + + paths = dict(list(paths.items())[start_index:end_index]) + + current_level = [0] + + start_path, end_path = None, None + + if start_or_ref: + start_path = paths[start_or_ref] + current_level.append(len(start_path)) + if end: + end_path = paths[end] + current_level.append(len(end_path)) + + current_level = max(current_level) + + if down == -1: + down = max(list(map(len, paths.values()))) + + if down == 0: + paths = {key: value for key, value in paths.items() if len(value) == current_level} + else: + paths = {key: value for key, value in paths.items() if current_level < len(value) <= down + current_level} + + return ( + [ + strip_members(get_member_by_path(refs, path)) for path in paths.values() + ], + strip_members(get_member_by_path(refs, start_path)) if start_path else None, + strip_members(get_member_by_path(refs, end_path)) if end_path else None + ) + + if __name__ == "__main__": import flask import os diff --git a/tests/test_db_create.py b/tests/test_db_create.py index d5d8604..e8dcdf4 100644 --- a/tests/test_db_create.py +++ b/tests/test_db_create.py @@ -1,5 +1,5 @@ import flask -from dapitains.app.ingest import generate_paths, get_member_by_path +from dapitains.app.ingest import generate_paths, get_member_by_path, get_nav, strip_members from dapitains.tei.document import Document import os @@ -48,5 +48,30 @@ def test_simple_path(): "Mark 1:3": [1, 0, 3] } } + assert strip_members(get_member_by_path(refs[None], paths[None]["Luke"])) == {'citeType': 'book', 'ref': 'Luke'} assert get_member_by_path(refs[None], paths[None]["Mark 1:3"]) == {'citeType': 'verse', 'ref': 'Mark 1:3'} + +def test_navigation(): + doc = Document(f"{local_dir}/tei/base_tei.xml") + refs = { + tree: [ref.json() for ref in obj.find_refs(doc.xml, structure=obj.units)] + for tree, obj in doc.citeStructure.items() + } + paths = {tree: generate_paths(ref) for tree, ref in refs.items()} + assert get_nav(refs[None], paths[None], start_or_ref=None, end=None, down=1) == ([ + {'citeType': 'book', 'ref': 'Luke'}, + {'citeType': 'book', 'ref': 'Mark'} + ], None, None) + assert get_nav(refs[None], paths[None], start_or_ref="Luke 1:1", end="Luke 1#1", down=0) == ([ + {'citeType': 'verse', 'ref': 'Luke 1:1'}, + {'citeType': 'verse', 'ref': 'Luke 1:2'}, + {'citeType': 'bloup', 'ref': 'Luke 1#1'} + ], {'citeType': 'verse', 'ref': 'Luke 1:1'}, {'citeType': 'bloup', 'ref': 'Luke 1#1'}) + assert get_nav(refs[None], paths[None], start_or_ref="Luke 1:1", end="Mark 1:2", down=0) == ([ + {'citeType': 'verse', 'ref': 'Luke 1:1'}, + {'citeType': 'verse', 'ref': 'Luke 1:2'}, + {'citeType': 'bloup', 'ref': 'Luke 1#1'}, + {'citeType': 'verse', 'ref': 'Mark 1:1'}, + {'citeType': 'verse', 'ref': 'Mark 1:2'} + ], {'citeType': 'verse', 'ref': 'Luke 1:1'}, {'citeType': 'verse', 'ref': 'Mark 1:2'}) \ No newline at end of file From 0e2257089422f563fc51fce66b5b0c9a5a2aee6f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thibault=20Cl=C3=A9rice?= Date: Sat, 24 Aug 2024 21:45:55 +0200 Subject: [PATCH 04/16] Updating workflow --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index f057f9a..64ef997 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -14,7 +14,7 @@ jobs: steps: - uses: actions/checkout@v4 - name: Set up Python - uses: actions/setup-python@v2 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - name: Display Python version From 2867cd040680adc41ae823286fe3dd93b356f746 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thibault=20Cl=C3=A9rice?= Date: Sun, 25 Aug 2024 09:44:01 +0200 Subject: [PATCH 05/16] [WIP] Navigation function is a go --- dapitains/app/ingest.py | 15 +++++-- dapitains/tei/citeStructure.py | 35 ++++++++++----- tests/test_citeStructure.py | 51 +++++++++++----------- tests/test_db_create.py | 78 ++++++++++++++++++++++++++-------- 4 files changed, 123 insertions(+), 56 deletions(-) diff --git a/dapitains/app/ingest.py b/dapitains/app/ingest.py index 91151b8..c46a2fb 100644 --- a/dapitains/app/ingest.py +++ b/dapitains/app/ingest.py @@ -80,17 +80,24 @@ def recurse(items, current_path): def get_nav( refs: List[Dict[str, Any]], paths: Dict[str, List[int]], - start_or_ref: Optional[str], - end: Optional[str], + start_or_ref: Optional[str] = None, + end: Optional[str] = None, down: Optional[int] = 1 ) -> Tuple[List[Dict[str, Any]], Optional[Dict[str, Any]], Optional[Dict[str, Any]]]: + """ Given a references set and a path set, provide the CitableUnit from start to end at down level. + + """ paths_index = list(paths.keys()) start_index, end_index = None, None - if start_or_ref: - start_index = paths_index.index(start_or_ref) if end: end_index = paths_index.index(end) + 1 + if start_or_ref: + start_index = paths_index.index(start_or_ref) + if not end: + for index, reference in enumerate(paths_index[start_index+1:]): + if len(paths[start_or_ref]) == len(paths[reference]): + end_index = index + start_index + 1 paths = dict(list(paths.items())[start_index:end_index]) diff --git a/dapitains/tei/citeStructure.py b/dapitains/tei/citeStructure.py index 5767544..bdf2338 100644 --- a/dapitains/tei/citeStructure.py +++ b/dapitains/tei/citeStructure.py @@ -47,11 +47,15 @@ class CitableUnit: node: Optional[saxonlib.PyXdmNode] = None dublinCore: Dict[str, List[str]] = field(default_factory=lambda: defaultdict(list)) extension: Dict[str, List[str]] = field(default_factory=lambda: defaultdict(list)) + level: int = 1 + parent: Optional[str] = None def json(self): out = { "citeType": self.citeType, - "ref": self.ref + "ref": self.ref, + "level": self.level, + "parent": self.parent } if self.children: out["members"] = [ @@ -189,26 +193,30 @@ def _dispatch( child_xpath: str, structure: CitableStructure, xpath_processor: saxonlib.PyXPathProcessor, - unit: CitableUnit): + unit: CitableUnit, + level: int): # target = self.generate_xpath(child.ref) if len(structure.children) == 1: self.find_refs( root=xpath_processor.evaluate_single(child_xpath), structure=structure.children[0], - unit=unit + unit=unit, + level=level ) else: self.find_refs_from_branches( root=xpath_processor.evaluate_single(child_xpath), structure=structure.children, - unit=unit + unit=unit, + level=level ) def find_refs( self, root: saxonlib.PyXdmNode, structure: CitableStructure = None, - unit: Optional[CitableUnit] = None + unit: Optional[CitableUnit] = None, + level: int = 1 ) -> List[CitableUnit]: xpath_proc = get_xpath_proc(elem=root) prefix = (unit.ref + structure.delim) if unit else "" @@ -218,7 +226,9 @@ def find_refs( for value in xpath_proc.evaluate(f"{xpath_prefix}{structure.xpath}"): child = CitableUnit( citeType=structure.citeType, - ref=f"{prefix}{value.string_value}" + ref=f"{prefix}{value.string_value}", + parent=unit.ref if unit else None, + level=level ) if structure.metadata: @@ -238,7 +248,8 @@ def find_refs( child_xpath=self.generate_xpath(child.ref), structure=structure, xpath_processor=xpath_proc, - unit=child + unit=child, + level=level+1 ) return units @@ -246,7 +257,8 @@ def find_refs_from_branches( self, root: saxonlib.PyXdmNode, structure: List[CitableStructure], - unit: Optional[CitableUnit] = None + unit: Optional[CitableUnit] = None, + level: int = 1 ) -> List[CitableUnit]: xpath_proc = get_xpath_proc(elem=root) prefix = (unit.ref) if unit else "" # ToDo: Reinject delim @@ -281,7 +293,9 @@ def compare_nodes_by_doc_order(node1, node2): for elem in unsorted: child_unit = CitableUnit( citeType=elem.struct.citeType, - ref=elem.citation + ref=elem.citation, + level=level, + parent=unit.ref if unit else None ) if unit: @@ -294,7 +308,8 @@ def compare_nodes_by_doc_order(node1, node2): child_xpath=self.generate_xpath(child_unit.ref), structure=elem.struct, xpath_processor=xpath_proc, - unit=child_unit + unit=child_unit, + level=level+1 ) return units diff --git a/tests/test_citeStructure.py b/tests/test_citeStructure.py index e73edd0..0dca40d 100644 --- a/tests/test_citeStructure.py +++ b/tests/test_citeStructure.py @@ -59,19 +59,20 @@ def test_parsing(): assert parser.generate_xpath("Luke") == "//body/div[@n='Luke']" assert [root.json() for root in parser.find_refs(root=TEI, structure=parser.units)] == [ - {'citeType': 'book', 'ref': 'Luke', 'members': [ - {'citeType': 'chapter', 'ref': 'Luke 1', 'members': [ - {'citeType': 'verse', 'ref': 'Luke 1:1'}, - {'citeType': 'verse', 'ref': 'Luke 1:2'}, - {'citeType': 'bloup', 'ref': 'Luke 1#1'} - ]} + {'citeType': 'book', 'ref': 'Luke', 'parent': None, 'level': 1, 'members': [ + {'citeType': 'chapter', 'ref': 'Luke 1', 'parent': 'Luke', 'level': 2, 'members': [ + {'citeType': 'verse', 'ref': 'Luke 1:1', 'parent': 'Luke 1', 'level': 3}, + {'citeType': 'verse', 'ref': 'Luke 1:2', 'parent': 'Luke 1', 'level': 3}, + {'citeType': 'bloup', 'ref': 'Luke 1#1', 'parent': 'Luke 1', 'level': 3} + ] + } ]}, - {'citeType': 'book', 'ref': 'Mark', 'members': [ - {'citeType': 'chapter', 'ref': 'Mark 1', 'members': [ - {'citeType': 'verse', 'ref': 'Mark 1:1'}, - {'citeType': 'verse', 'ref': 'Mark 1:2'}, - {'citeType': 'bloup', 'ref': 'Mark 1#1'}, - {'citeType': 'verse', 'ref': 'Mark 1:3'} + {'citeType': 'book', 'ref': 'Mark', 'parent': None, 'level': 1, 'members': [ + {'citeType': 'chapter', 'ref': 'Mark 1', 'parent': 'Mark', 'level': 2, 'members': [ + {'citeType': 'verse', 'ref': 'Mark 1:1', 'parent': 'Mark 1', 'level': 3}, + {'citeType': 'verse', 'ref': 'Mark 1:2', 'parent': 'Mark 1', 'level': 3}, + {'citeType': 'bloup', 'ref': 'Mark 1#1', 'parent': 'Mark 1', 'level': 3}, + {'citeType': 'verse', 'ref': 'Mark 1:3', 'parent': 'Mark 1', 'level': 3} ]} ]} ] @@ -84,17 +85,17 @@ def test_cite_data(): refs = parser.find_refs(root=TEI, structure=parser.units) refs = [ref.json() for ref in refs] assert refs == [ - {'citeType': 'book', 'ref': '1', 'dublinCore': { + {'citeType': 'book', 'ref': '1', 'parent': None, 'level': 1, 'dublinCore': { 'http://purl.org/dc/terms/title': ['Introduction', 'Introduction'], 'http://purl.org/dc/terms/creator': ['John Doe']}}, - {'citeType': 'book', 'ref': '2', 'dublinCore': {'http://purl.org/dc/terms/title': ["Background", 'Contexte']}}, - {'citeType': 'book', 'ref': '3', 'dublinCore': { + {'citeType': 'book', 'ref': '2', 'parent': None, 'level': 1, 'dublinCore': {'http://purl.org/dc/terms/title': ["Background", 'Contexte']}}, + {'citeType': 'book', 'ref': '3', 'parent': None, 'level': 1, 'dublinCore': { 'http://purl.org/dc/terms/title': ['Methodology', 'Méthodologie'], 'http://purl.org/dc/terms/creator': ['Albert Einstein']}}, - {'citeType': 'book', 'ref': '4', 'dublinCore': { + {'citeType': 'book', 'ref': '4', 'parent': None, 'level': 1, 'dublinCore': { 'http://purl.org/dc/terms/title': ['Results', 'Résultats'], 'http://purl.org/dc/terms/creator': ['Isaac Newton']}}, - {'citeType': 'book', 'ref': '5', 'dublinCore': { + {'citeType': 'book', 'ref': '5', 'parent': None, 'level': 1, 'dublinCore': { 'http://purl.org/dc/terms/title': ['Conclusion', 'Conclusion'], 'http://purl.org/dc/terms/creator': ['Marie Curie'] }}] @@ -108,24 +109,24 @@ def test_advanced_cite_data(): refs = parser.find_refs(root=TEI, structure=parser.units) refs = [ref.json() for ref in refs] assert refs == [ - {'citeType': 'part', 'ref': 'part-1', 'members': [ - {'citeType': 'book', 'ref': 'part-1.1', 'dublinCore': { + {'citeType': 'part', 'ref': 'part-1', 'parent': None, 'level': 1, 'members': [ + {'citeType': 'book', 'ref': 'part-1.1', 'parent': 'part-1', 'level': 2, 'dublinCore': { 'http://purl.org/dc/terms/title': ['Introduction', 'Introduction'], 'http://purl.org/dc/terms/creator': ['John Doe']}}, - {'citeType': 'book', 'ref': 'part-1.2', 'dublinCore': { + {'citeType': 'book', 'ref': 'part-1.2', 'parent': 'part-1', 'level': 2, 'dublinCore': { 'http://purl.org/dc/terms/title': ["Background", 'Contexte'] }} ], 'extension': {"http://foo.bar/part": ["1"]}}, - {'citeType': 'part', 'ref': 'part-2', 'members': [ - {'citeType': 'book', 'ref': 'part-2.3', 'dublinCore': { + {'citeType': 'part', 'ref': 'part-2', 'parent': None, 'level': 1, 'members': [ + {'citeType': 'book', 'ref': 'part-2.3', 'parent': 'part-2', 'level': 2, 'dublinCore': { 'http://purl.org/dc/terms/title': ['Methodology', 'Méthodologie'], 'http://purl.org/dc/terms/creator': ['Albert Einstein']}}, - {'citeType': 'book', 'ref': 'part-2.4', 'dublinCore': { + {'citeType': 'book', 'ref': 'part-2.4', 'parent': 'part-2', 'level': 2, 'dublinCore': { 'http://purl.org/dc/terms/title': ['Results', 'Résultats'], 'http://purl.org/dc/terms/creator': ['Isaac Newton']}} ], 'extension': {"http://foo.bar/part": ["2"]}}, - {'citeType': 'part', 'ref': 'part-3', 'members': [ - {'citeType': 'book', 'ref': 'part-3.5', 'dublinCore': { + {'citeType': 'part', 'ref': 'part-3', 'parent': None, 'level': 1, 'members': [ + {'citeType': 'book', 'ref': 'part-3.5', 'parent': 'part-3', 'level': 2, 'dublinCore': { 'http://purl.org/dc/terms/title': ['Conclusion', 'Conclusion'], 'http://purl.org/dc/terms/creator': ['Marie Curie'] }} diff --git a/tests/test_db_create.py b/tests/test_db_create.py index e8dcdf4..edf4080 100644 --- a/tests/test_db_create.py +++ b/tests/test_db_create.py @@ -48,8 +48,12 @@ def test_simple_path(): "Mark 1:3": [1, 0, 3] } } - assert strip_members(get_member_by_path(refs[None], paths[None]["Luke"])) == {'citeType': 'book', 'ref': 'Luke'} - assert get_member_by_path(refs[None], paths[None]["Mark 1:3"]) == {'citeType': 'verse', 'ref': 'Mark 1:3'} + assert strip_members( + get_member_by_path(refs[None], paths[None]["Luke"]) + ) == {'citeType': 'book', 'ref': 'Luke', "level": 1, "parent": None}, "Check that members are stripped" + assert get_member_by_path( + refs[None], paths[None]["Mark 1:3"] + ) == {'citeType': 'verse', 'ref': 'Mark 1:3', "level": 3, "parent": "Mark 1"} def test_navigation(): @@ -59,19 +63,59 @@ def test_navigation(): for tree, obj in doc.citeStructure.items() } paths = {tree: generate_paths(ref) for tree, ref in refs.items()} + assert get_nav(refs[None], paths[None], start_or_ref=None, end=None, down=1) == ([ - {'citeType': 'book', 'ref': 'Luke'}, - {'citeType': 'book', 'ref': 'Mark'} - ], None, None) - assert get_nav(refs[None], paths[None], start_or_ref="Luke 1:1", end="Luke 1#1", down=0) == ([ - {'citeType': 'verse', 'ref': 'Luke 1:1'}, - {'citeType': 'verse', 'ref': 'Luke 1:2'}, - {'citeType': 'bloup', 'ref': 'Luke 1#1'} - ], {'citeType': 'verse', 'ref': 'Luke 1:1'}, {'citeType': 'bloup', 'ref': 'Luke 1#1'}) - assert get_nav(refs[None], paths[None], start_or_ref="Luke 1:1", end="Mark 1:2", down=0) == ([ - {'citeType': 'verse', 'ref': 'Luke 1:1'}, - {'citeType': 'verse', 'ref': 'Luke 1:2'}, - {'citeType': 'bloup', 'ref': 'Luke 1#1'}, - {'citeType': 'verse', 'ref': 'Mark 1:1'}, - {'citeType': 'verse', 'ref': 'Mark 1:2'} - ], {'citeType': 'verse', 'ref': 'Luke 1:1'}, {'citeType': 'verse', 'ref': 'Mark 1:2'}) \ No newline at end of file + {'citeType': 'book', 'ref': 'Luke', "level": 1, "parent": None}, + {'citeType': 'book', 'ref': 'Mark', "level": 1, "parent": None} + ], None, None), "Check that base function works" + + assert get_nav(refs[None], paths[None], start_or_ref="Luke 1:1", end="Luke 1#1", down=0) == ( + [ + {'citeType': 'verse', 'ref': 'Luke 1:1', "level": 3, "parent": "Luke 1"}, + {'citeType': 'verse', 'ref': 'Luke 1:2', "level": 3, "parent": "Luke 1"}, + {'citeType': 'bloup', 'ref': 'Luke 1#1', "level": 3, "parent": "Luke 1"} + ], + {'citeType': 'verse', 'ref': 'Luke 1:1', "level": 3, "parent": "Luke 1"}, + {'citeType': 'bloup', 'ref': 'Luke 1#1', "level": 3, "parent": "Luke 1"} + ), "Check that ?start/end works" + + assert get_nav(refs[None], paths[None], start_or_ref="Luke 1:1", end="Mark 1:2", down=0) == ( + [ + {'citeType': 'verse', 'ref': 'Luke 1:1', "level": 3, "parent": "Luke 1"}, + {'citeType': 'verse', 'ref': 'Luke 1:2', "level": 3, "parent": "Luke 1"}, + {'citeType': 'bloup', 'ref': 'Luke 1#1', "level": 3, "parent": "Luke 1"}, + {'citeType': 'verse', 'ref': 'Mark 1:1', "level": 3, "parent": "Mark 1"}, + {'citeType': 'verse', 'ref': 'Mark 1:2', "level": 3, "parent": "Mark 1"} + ], + {'citeType': 'verse', 'ref': 'Luke 1:1', "level": 3, "parent": "Luke 1"}, + {'citeType': 'verse', 'ref': 'Mark 1:2', "level": 3, "parent": "Mark 1"} + ), "Check that ?start/end works across parents" + + assert get_nav(refs[None], paths[None], start_or_ref="Luke 1", down=1) == ( + [ + {'citeType': 'verse', 'ref': 'Luke 1:1', "level": 3, "parent": "Luke 1"}, + {'citeType': 'verse', 'ref': 'Luke 1:2', "level": 3, "parent": "Luke 1"}, + {'citeType': 'bloup', 'ref': 'Luke 1#1', "level": 3, "parent": "Luke 1"} + ], + {'citeType': 'chapter', 'ref': 'Luke 1', "level": 2, "parent": "Luke"}, + None + ), "Check that ?ref works" + + assert get_nav(refs[None], paths[None], start_or_ref="Luke", down=1) == ( + [ + {'citeType': 'chapter', 'ref': 'Luke 1', "level": 2, "parent": "Luke"}, + ], + {'citeType': 'book', 'ref': 'Luke', "level": 1, "parent": None}, + None + ), "Check that ?ref works" + + assert get_nav(refs[None], paths[None], start_or_ref=None, end=None, down=2) == ( + [ + {'citeType': 'book', 'ref': 'Luke', "level": 1, "parent": None}, + {'citeType': 'chapter', 'ref': 'Luke 1', "level": 2, "parent": "Luke"}, + {'citeType': 'book', 'ref': 'Mark', "level": 1, "parent": None}, + {'citeType': 'chapter', 'ref': 'Mark 1', "level": 2, "parent": "Mark"} + ], + None, + None + ), "Check that down=2 works" \ No newline at end of file From 98afe9120251e9de2f42214e2e6cc82fdba1f79d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thibault=20Cl=C3=A9rice?= Date: Sun, 25 Aug 2024 11:48:15 +0200 Subject: [PATCH 06/16] [WIP] Navigation route. Case of down=0 not well implemented. Navigation is unavailable in the Database --- dapitains/app/app.py | 98 +++++++++++++++++++++++++++++++++++++++ dapitains/app/database.py | 6 +-- dapitains/app/ingest.py | 12 ++++- 3 files changed, 111 insertions(+), 5 deletions(-) create mode 100644 dapitains/app/app.py diff --git a/dapitains/app/app.py b/dapitains/app/app.py new file mode 100644 index 0000000..a1c77ae --- /dev/null +++ b/dapitains/app/app.py @@ -0,0 +1,98 @@ +from typing import Dict, Any +try: + import uritemplate + from flask import Flask, request, Response + from flask_sqlalchemy import SQLAlchemy + import click +except ImportError: + print("This part of the package can only be imported with the web requirements.") + raise + +import json + +from dapitains.app.database import db, Collection, Navigation +from dapitains.app.ingest import get_nav + +def msg_4xx(string, code=404) -> Response: + return Response(json.dumps({"message": string}), status=code, mimetype="application/json") + + + + +def create_app( + app: Flask, + use_query: bool = False, + # navigation_template: str = "/navigation?resource=https://en.wikisource.org/wiki/Dracula{&ref,down,start,end,tree,page}" +) -> (Flask, SQLAlchemy): + """ + + Initialisation of the DB is up to you + """ + + @app.route("/navigation") + def navigation_route(): + resource = request.args.get("resource") + ref = request.args.get("ref") + start = request.args.get("start") + end = request.args.get("end") + tree = request.args.get("tree") + down = request.args.get("down", type=int, default=None) + + if not resource: + return msg_4xx("Resource parameter was not provided") + + collection: Collection = Collection.query.where(Collection.identifier == resource).first() + if not collection: + return msg_4xx(f"Unknown resource `{resource}`") + nav: Navigation = collection.navigation + + # Check for forbidden combinations + if ref or start or end: + if tree not in nav.references: + return msg_4xx(f"Unknown tree {tree} for resource `{resource}`") + elif ref and (start or end): + return msg_4xx(f"You cannot provide a ref parameter as well as start or end", code=400) + elif not ref and ((start and not end) or (end and not start)): + return msg_4xx(f"Range is missing one of its parameters (start or end)", code=400) + elif down is None and not (ref or start or end): + return msg_4xx(f"The down query parameter is required when requesting without ref or start/end", code=400) + + refs = nav.references[tree] + paths = nav.paths[tree] + members, start, end = get_nav(refs=refs, paths=paths, start_or_ref=start or ref, end=end, down=down) + + return { + "@context": "https://distributed-text-services.github.io/specifications/context/1-alpha1.json", + "dtsVersion": "1-alpha", + "@type": "Navigation", + "@id": "https://example.org/api/dts/navigation/?resource=https://en.wikisource.org/wiki/Dracula&down=1", + "resource": collection.json(), # To Do: implement and inject URI templates + "members": members + } + + return app, db + + +if __name__ == "__main__": + import os + from dapitains.app.ingest import store_catalog + from dapitains.metadata.xml_parser import ingest_catalog + + app = Flask(__name__) + _, db = create_app(app) + + basedir = os.path.abspath(os.path.dirname(__file__)) + db_path = os.path.join(basedir, 'app.db') + app.config['SQLALCHEMY_DATABASE_URI'] = f'sqlite:///{db_path}' + app.config['SQLALCHEMY_TRACK_MODIFICATIONS'] = False + + db.init_app(app) + with app.app_context(): + db.drop_all() + db.create_all() + + catalog, _ = ingest_catalog("/home/thibault/dev/MyDapytains/tests/catalog/example-collection.xml") + print(catalog) + store_catalog(catalog) + + app.run() \ No newline at end of file diff --git a/dapitains/app/database.py b/dapitains/app/database.py index 76f7259..7b2621e 100644 --- a/dapitains/app/database.py +++ b/dapitains/app/database.py @@ -39,7 +39,7 @@ def process_result_value(self, value, dialect): class Collection(db.Model): __tablename__ = 'collections' - id = db.Column(db.Integer, primary_key=True, autoincrement=True) + id = db.Column(db.Integer, primary_key=True, autoincrement=True, nullable=False) identifier = db.Column(db.String, nullable=False, unique=True) title = db.Column(db.String, nullable=False) description = db.Column(db.String, nullable=True) @@ -76,9 +76,9 @@ def from_class(cls, obj: abstracts.Collection) -> "Collection": class Navigation(db.Model): __tablename__ = 'navigations' - id = db.Column(db.Integer, primary_key=True) + id = db.Column(db.Integer, primary_key=True, autoincrement=True, nullable=False) collection_id = db.Column(db.Integer, db.ForeignKey('collections.id'), nullable=False, unique=True) - default_tree = db.Column(db.String, nullable=True) + # default_tree = db.Column(db.String, nullable=True) # JSON fields stored as TEXT paths = db.Column(JSONEncoded, nullable=False, default={}) diff --git a/dapitains/app/ingest.py b/dapitains/app/ingest.py index c46a2fb..68bcdea 100644 --- a/dapitains/app/ingest.py +++ b/dapitains/app/ingest.py @@ -6,12 +6,20 @@ def store_catalog(catalog: Catalog): for identifier, collection in catalog.objects.items(): - db.session.add(Collection.from_class(collection)) + coll_db = Collection.from_class(collection) + db.session.add(coll_db) + db.session.flush() if collection.resource: doc = Document(collection.filepath) references = { - key: struct.find_refs(root=doc.xml, structure=struct.units) for key, struct in doc.citeStructure.items() + tree: [ref.json() for ref in obj.find_refs(doc.xml, structure=obj.units)] + for tree, obj in doc.citeStructure.items() } + paths = {key: generate_paths(tree) for key, tree in references.items()} + nav = Navigation(collection_id=coll_db.id, paths=paths, references=references) + db.session.add(nav) + db.session.commit() + def get_member_by_path(data: List[Dict[str, Any]], path: List[int]) -> Optional[Dict[str, Any]]: From f148f84a92007f8d0832cb448eb2bd1cf44af056 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thibault=20Cl=C3=A9rice?= Date: Sun, 25 Aug 2024 12:47:53 +0200 Subject: [PATCH 07/16] Working base Nav --- dapitains/app/app.py | 27 ++++++++++++++------------- dapitains/app/database.py | 12 ++++++++++-- dapitains/app/ingest.py | 1 + 3 files changed, 25 insertions(+), 15 deletions(-) diff --git a/dapitains/app/app.py b/dapitains/app/app.py index a1c77ae..0c35594 100644 --- a/dapitains/app/app.py +++ b/dapitains/app/app.py @@ -17,8 +17,6 @@ def msg_4xx(string, code=404) -> Response: return Response(json.dumps({"message": string}), status=code, mimetype="application/json") - - def create_app( app: Flask, use_query: bool = False, @@ -44,7 +42,10 @@ def navigation_route(): collection: Collection = Collection.query.where(Collection.identifier == resource).first() if not collection: return msg_4xx(f"Unknown resource `{resource}`") - nav: Navigation = collection.navigation + + nav: Navigation = Navigation.query.where(Navigation.collection_id == collection.id).first() + if nav is None: + return msg_4xx(f"The resource `{resource}` does not support navigation") # Check for forbidden combinations if ref or start or end: @@ -54,7 +55,8 @@ def navigation_route(): return msg_4xx(f"You cannot provide a ref parameter as well as start or end", code=400) elif not ref and ((start and not end) or (end and not start)): return msg_4xx(f"Range is missing one of its parameters (start or end)", code=400) - elif down is None and not (ref or start or end): + else: + if down is None: return msg_4xx(f"The down query parameter is required when requesting without ref or start/end", code=400) refs = nav.references[tree] @@ -66,8 +68,8 @@ def navigation_route(): "dtsVersion": "1-alpha", "@type": "Navigation", "@id": "https://example.org/api/dts/navigation/?resource=https://en.wikisource.org/wiki/Dracula&down=1", - "resource": collection.json(), # To Do: implement and inject URI templates - "members": members + #"resource": collection.json(), # To Do: implement and inject URI templates + "member": members } return app, db @@ -87,12 +89,11 @@ def navigation_route(): app.config['SQLALCHEMY_TRACK_MODIFICATIONS'] = False db.init_app(app) - with app.app_context(): - db.drop_all() - db.create_all() - - catalog, _ = ingest_catalog("/home/thibault/dev/MyDapytains/tests/catalog/example-collection.xml") - print(catalog) - store_catalog(catalog) + # with app.app_context(): + # db.drop_all() + # db.create_all() + # + # catalog, _ = ingest_catalog("/home/thibault/dev/MyDapytains/tests/catalog/example-collection.xml") + # store_catalog(catalog) app.run() \ No newline at end of file diff --git a/dapitains/app/database.py b/dapitains/app/database.py index 7b2621e..77ba48b 100644 --- a/dapitains/app/database.py +++ b/dapitains/app/database.py @@ -13,6 +13,14 @@ import json +class CustomKeyJSONDecoder(json.JSONDecoder): + def __init__(self, *args, **kwargs): + super().__init__(object_hook=self.object_hook, *args, **kwargs) + + def object_hook(self, obj): + # Only convert 'None' string keys back to None + return {None if k == 'null' else k: v for k, v in obj.items()} + db = SQLAlchemy() parent_child_association = db.Table('parent_child_association', @@ -34,7 +42,7 @@ def process_bind_param(self, value, dialect): def process_result_value(self, value, dialect): if value is None: return None - return json.loads(value) + return json.loads(value, cls=CustomKeyJSONDecoder) class Collection(db.Model): __tablename__ = 'collections' @@ -49,7 +57,7 @@ class Collection(db.Model): extensions = db.Column(JSONEncoded, nullable=True) # One-to-one relationship with Navigation - navigation = db.relationship('Navigation', uselist=False, backref='collection', lazy='noload') + navigation = db.relationship('Navigation', uselist=False, backref='collection', lazy=True) parents = db.relationship( diff --git a/dapitains/app/ingest.py b/dapitains/app/ingest.py index 68bcdea..51192df 100644 --- a/dapitains/app/ingest.py +++ b/dapitains/app/ingest.py @@ -17,6 +17,7 @@ def store_catalog(catalog: Catalog): } paths = {key: generate_paths(tree) for key, tree in references.items()} nav = Navigation(collection_id=coll_db.id, paths=paths, references=references) + print(nav.paths) db.session.add(nav) db.session.commit() From 47ec9853cc08bab1ae018c714a0eac7178b98747 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thibault=20Cl=C3=A9rice?= Date: Sun, 25 Aug 2024 14:30:26 +0200 Subject: [PATCH 08/16] [WIP] Ability to show the resource in json --- dapitains/app/app.py | 14 +++++++------- dapitains/app/database.py | 16 ++++++++++++++++ dapitains/metadata/classes.py | 10 ++++++++-- 3 files changed, 31 insertions(+), 9 deletions(-) diff --git a/dapitains/app/app.py b/dapitains/app/app.py index 0c35594..ddb46ae 100644 --- a/dapitains/app/app.py +++ b/dapitains/app/app.py @@ -68,7 +68,7 @@ def navigation_route(): "dtsVersion": "1-alpha", "@type": "Navigation", "@id": "https://example.org/api/dts/navigation/?resource=https://en.wikisource.org/wiki/Dracula&down=1", - #"resource": collection.json(), # To Do: implement and inject URI templates + "resource": collection.json(), # To Do: implement and inject URI templates "member": members } @@ -89,11 +89,11 @@ def navigation_route(): app.config['SQLALCHEMY_TRACK_MODIFICATIONS'] = False db.init_app(app) - # with app.app_context(): - # db.drop_all() - # db.create_all() - # - # catalog, _ = ingest_catalog("/home/thibault/dev/MyDapytains/tests/catalog/example-collection.xml") - # store_catalog(catalog) + with app.app_context(): + db.drop_all() + db.create_all() + + catalog, _ = ingest_catalog("/home/thibault/dev/MyDapytains/tests/catalog/example-collection.xml") + store_catalog(catalog) app.run() \ No newline at end of file diff --git a/dapitains/app/database.py b/dapitains/app/database.py index 77ba48b..e606dc6 100644 --- a/dapitains/app/database.py +++ b/dapitains/app/database.py @@ -7,6 +7,7 @@ print("This part of the package can only be imported with the web requirements.") raise +from typing import Optional, Dict, Any import dapitains.metadata.classes as abstracts from dapitains.metadata.xml_parser import Catalog from dapitains.tei.document import Document @@ -68,6 +69,21 @@ class Collection(db.Model): backref='children' ) + def json(self, inject: Optional[Dict[str, Any]] = None): + data = { + "@type": "Resource" if self.resource else "Collection", + "title": self.title, + **(inject or {}) + } + if self.description: + data["description"] = self.description + if self.dublin_core: + data["dublinCore"] = self.dublin_core + if self.extensions: + data["extensions"] = self.extensions + + return data + @classmethod def from_class(cls, obj: abstracts.Collection) -> "Collection": return cls( diff --git a/dapitains/metadata/classes.py b/dapitains/metadata/classes.py index a77f694..09de602 100644 --- a/dapitains/metadata/classes.py +++ b/dapitains/metadata/classes.py @@ -9,7 +9,10 @@ class DublinCore: language: Optional[str] = None def json(self): - return {"property": f"http://purl.org/dc/terms/{self.term}", "value": self.value, "language": self.language} + if self.language: + return {"property": f"http://purl.org/dc/terms/{self.term}", "value": self.value, "lang": self.language} + else: + return {"property": f"http://purl.org/dc/terms/{self.term}", "value": self.value} class Extension(DublinCore): @@ -18,7 +21,10 @@ class Extension(DublinCore): language: Optional[str] = None def json(self): - return {"property": self.term, "value": self.value, "language": self.language} + if self.language: + return {"property": self.term, "value": self.value, "language": self.language} + else: + return {"property": self.term, "value": self.value} @dataclass From 54807c4fb01a733067e9599ad9c3a490b471646f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thibault=20Cl=C3=A9rice?= Date: Sun, 25 Aug 2024 21:13:06 +0200 Subject: [PATCH 09/16] URI templates implemented --- dapitains/app/app.py | 88 ++++++++++++++++++++++++++------------------ 1 file changed, 52 insertions(+), 36 deletions(-) diff --git a/dapitains/app/app.py b/dapitains/app/app.py index ddb46ae..43c23ad 100644 --- a/dapitains/app/app.py +++ b/dapitains/app/app.py @@ -1,4 +1,9 @@ from typing import Dict, Any + +from sqlalchemy.orm.collections import collection + +from tests.test_db_create import test_navigation + try: import uritemplate from flask import Flask, request, Response @@ -17,6 +22,44 @@ def msg_4xx(string, code=404) -> Response: return Response(json.dumps({"message": string}), status=code, mimetype="application/json") +def navigation_view(resource, ref, start, end, tree, down, templates): + if not resource: + return msg_4xx("Resource parameter was not provided") + + collection: Collection = Collection.query.where(Collection.identifier == resource).first() + if not collection: + return msg_4xx(f"Unknown resource `{resource}`") + + nav: Navigation = Navigation.query.where(Navigation.collection_id == collection.id).first() + if nav is None: + return msg_4xx(f"The resource `{resource}` does not support navigation") + + # Check for forbidden combinations + if ref or start or end: + if tree not in nav.references: + return msg_4xx(f"Unknown tree {tree} for resource `{resource}`") + elif ref and (start or end): + return msg_4xx(f"You cannot provide a ref parameter as well as start or end", code=400) + elif not ref and ((start and not end) or (end and not start)): + return msg_4xx(f"Range is missing one of its parameters (start or end)", code=400) + else: + if down is None: + return msg_4xx(f"The down query parameter is required when requesting without ref or start/end", code=400) + + refs = nav.references[tree] + paths = nav.paths[tree] + members, start, end = get_nav(refs=refs, paths=paths, start_or_ref=start or ref, end=end, down=down) + print(templates) + return { + "@context": "https://distributed-text-services.github.io/specifications/context/1-alpha1.json", + "dtsVersion": "1-alpha", + "@type": "Navigation", + "@id": "https://example.org/api/dts/navigation/?resource=https://en.wikisource.org/wiki/Dracula&down=1", + "resource": collection.json(inject=templates), # To Do: implement and inject URI templates + "member": members + } + + def create_app( app: Flask, use_query: bool = False, @@ -26,8 +69,11 @@ def create_app( Initialisation of the DB is up to you """ + navigation_template = uritemplate.URITemplate("/navigation/{?resource}{&ref,start,end,tree,down}") + collection_template = uritemplate.URITemplate("/navigation/collection/{?id,page,nav}") + document_template = uritemplate.URITemplate("/document/{?resource}{&ref,start,end,tree}") - @app.route("/navigation") + @app.route("/navigation/") def navigation_route(): resource = request.args.get("resource") ref = request.args.get("ref") @@ -36,41 +82,11 @@ def navigation_route(): tree = request.args.get("tree") down = request.args.get("down", type=int, default=None) - if not resource: - return msg_4xx("Resource parameter was not provided") - - collection: Collection = Collection.query.where(Collection.identifier == resource).first() - if not collection: - return msg_4xx(f"Unknown resource `{resource}`") - - nav: Navigation = Navigation.query.where(Navigation.collection_id == collection.id).first() - if nav is None: - return msg_4xx(f"The resource `{resource}` does not support navigation") - - # Check for forbidden combinations - if ref or start or end: - if tree not in nav.references: - return msg_4xx(f"Unknown tree {tree} for resource `{resource}`") - elif ref and (start or end): - return msg_4xx(f"You cannot provide a ref parameter as well as start or end", code=400) - elif not ref and ((start and not end) or (end and not start)): - return msg_4xx(f"Range is missing one of its parameters (start or end)", code=400) - else: - if down is None: - return msg_4xx(f"The down query parameter is required when requesting without ref or start/end", code=400) - - refs = nav.references[tree] - paths = nav.paths[tree] - members, start, end = get_nav(refs=refs, paths=paths, start_or_ref=start or ref, end=end, down=down) - - return { - "@context": "https://distributed-text-services.github.io/specifications/context/1-alpha1.json", - "dtsVersion": "1-alpha", - "@type": "Navigation", - "@id": "https://example.org/api/dts/navigation/?resource=https://en.wikisource.org/wiki/Dracula&down=1", - "resource": collection.json(), # To Do: implement and inject URI templates - "member": members - } + return navigation_view(resource, ref, start, end, tree, down, templates={ + "navigation": navigation_template.partial({"resource": resource}).uri, + "collection": collection_template.partial({"id": resource}).uri, + "document": document_template.partial({"resource": resource}).uri, + }) return app, db From 08ace005137dd7e5b5b2a617ffc56303e17a1649 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thibault=20Cl=C3=A9rice?= Date: Mon, 26 Aug 2024 07:28:18 +0200 Subject: [PATCH 10/16] Nearly working collection --- dapitains/app/app.py | 63 ++++++++++++++++++++++++++++++++------- dapitains/app/database.py | 17 ++++++++++- dapitains/app/ingest.py | 16 ++++++++-- requirements.txt | 3 +- 4 files changed, 84 insertions(+), 15 deletions(-) diff --git a/dapitains/app/app.py b/dapitains/app/app.py index 43c23ad..5a9c2d1 100644 --- a/dapitains/app/app.py +++ b/dapitains/app/app.py @@ -1,4 +1,4 @@ -from typing import Dict, Any +from typing import Dict, Any, Optional from sqlalchemy.orm.collections import collection @@ -18,11 +18,45 @@ from dapitains.app.database import db, Collection, Navigation from dapitains.app.ingest import get_nav + def msg_4xx(string, code=404) -> Response: return Response(json.dumps({"message": string}), status=code, mimetype="application/json") -def navigation_view(resource, ref, start, end, tree, down, templates): +def collection_view(identifier: Optional[str], nav: str, templates: Dict[str, str]) -> Response: + if not identifier: + coll: Collection = db.session.query(Collection).filter(~Collection.parents.any()).first() + else: + coll = Collection.query.where(Collection.identifier==identifier).first() + if coll is None: + return msg_4xx("Unknown collection") + out = coll.json() + + if nav == 'children': + related_collections = db.session.query(Collection).filter( + Collection.parents.any(id=coll.id) + ).all() + elif nav == 'parents': + related_collections = db.session.query(Collection).filter( + Collection.children.any(id=coll.id) + ).all() + else: + return msg_4xx(f"nav parameter has a wrong value {nav}", code=400) + + return Response(json.dumps({ + "@context": "https://distributed-text-services.github.io/specifications/context/1-alpha1.json", + "dtsVersion": "1-alpha", + **out, + "totalParents": coll.total_parents, + "totalChildren": coll.total_children, + "member": [ + related.json() + for related in related_collections + ] + }), mimetype="application/json", status=200) + + +def navigation_view(resource, ref, start, end, tree, down, templates: Dict[str, str]) -> Response: if not resource: return msg_4xx("Resource parameter was not provided") @@ -49,30 +83,39 @@ def navigation_view(resource, ref, start, end, tree, down, templates): refs = nav.references[tree] paths = nav.paths[tree] members, start, end = get_nav(refs=refs, paths=paths, start_or_ref=start or ref, end=end, down=down) - print(templates) - return { + return Response(json.dumps({ "@context": "https://distributed-text-services.github.io/specifications/context/1-alpha1.json", "dtsVersion": "1-alpha", "@type": "Navigation", "@id": "https://example.org/api/dts/navigation/?resource=https://en.wikisource.org/wiki/Dracula&down=1", "resource": collection.json(inject=templates), # To Do: implement and inject URI templates "member": members - } + }), mimetype="application/json", status=200) def create_app( app: Flask, - use_query: bool = False, - # navigation_template: str = "/navigation?resource=https://en.wikisource.org/wiki/Dracula{&ref,down,start,end,tree,page}" + use_query: bool = False ) -> (Flask, SQLAlchemy): """ Initialisation of the DB is up to you """ navigation_template = uritemplate.URITemplate("/navigation/{?resource}{&ref,start,end,tree,down}") - collection_template = uritemplate.URITemplate("/navigation/collection/{?id,page,nav}") + collection_template = uritemplate.URITemplate("/navigation/collection/{?id,nav}") document_template = uritemplate.URITemplate("/document/{?resource}{&ref,start,end,tree}") + @app.route("/collection/") + def collection_route(): + resource = request.args.get("id") + nav = request.args.get("nav") + + return collection_view(resource, nav, templates={ + "navigation": navigation_template.partial({"resource": resource}).uri, + "collection": collection_template.partial({"id": resource}).uri, + "document": document_template.partial({"resource": resource}).uri, + }) + @app.route("/navigation/") def navigation_route(): resource = request.args.get("resource") @@ -109,7 +152,7 @@ def navigation_route(): db.drop_all() db.create_all() - catalog, _ = ingest_catalog("/home/thibault/dev/MyDapytains/tests/catalog/example-collection.xml") + catalog, _ = ingest_catalog(f"{basedir}/../../tests/catalog/example-collection.xml") store_catalog(catalog) - app.run() \ No newline at end of file + app.run() diff --git a/dapitains/app/database.py b/dapitains/app/database.py index e606dc6..9094e35 100644 --- a/dapitains/app/database.py +++ b/dapitains/app/database.py @@ -2,6 +2,7 @@ from flask_sqlalchemy import SQLAlchemy from sqlalchemy.ext.mutable import MutableDict, Mutable from sqlalchemy.types import TypeDecorator, TEXT + from sqlalchemy import func import click except ImportError: print("This part of the package can only be imported with the web requirements.") @@ -45,6 +46,7 @@ def process_result_value(self, value, dialect): return None return json.loads(value, cls=CustomKeyJSONDecoder) + class Collection(db.Model): __tablename__ = 'collections' @@ -60,7 +62,6 @@ class Collection(db.Model): # One-to-one relationship with Navigation navigation = db.relationship('Navigation', uselist=False, backref='collection', lazy=True) - parents = db.relationship( 'Collection', secondary=parent_child_association, @@ -69,9 +70,22 @@ class Collection(db.Model): backref='children' ) + @property + def total_children(self): + return db.session.query(func.count(parent_child_association.c.child_id)).filter( + parent_child_association.c.parent_id == self.id + ).scalar() + + @property + def total_parents(self): + return db.session.query(func.count(parent_child_association.c.parent_id)).filter( + parent_child_association.c.child_id == self.id + ).scalar() + def json(self, inject: Optional[Dict[str, Any]] = None): data = { "@type": "Resource" if self.resource else "Collection", + "@id": self.identifier, "title": self.title, **(inject or {}) } @@ -97,6 +111,7 @@ def from_class(cls, obj: abstracts.Collection) -> "Collection": extensions=[ext.json() for ext in obj.extension] ) + class Navigation(db.Model): __tablename__ = 'navigations' diff --git a/dapitains/app/ingest.py b/dapitains/app/ingest.py index 51192df..721a845 100644 --- a/dapitains/app/ingest.py +++ b/dapitains/app/ingest.py @@ -1,14 +1,18 @@ from typing import Dict, List, Optional, Any, Tuple -from dapitains.app.database import Collection, Navigation, db +from dapitains.app.database import Collection, Navigation, db, parent_child_association from dapitains.metadata.xml_parser import Catalog from dapitains.tei.document import Document import copy +import tqdm + def store_catalog(catalog: Catalog): - for identifier, collection in catalog.objects.items(): + keys = {} + for identifier, collection in tqdm.tqdm(catalog.objects.items(), desc="Parsing all collections"): coll_db = Collection.from_class(collection) db.session.add(coll_db) db.session.flush() + keys[coll_db.identifier] = coll_db.id if collection.resource: doc = Document(collection.filepath) references = { @@ -17,10 +21,16 @@ def store_catalog(catalog: Catalog): } paths = {key: generate_paths(tree) for key, tree in references.items()} nav = Navigation(collection_id=coll_db.id, paths=paths, references=references) - print(nav.paths) db.session.add(nav) db.session.commit() + for parent, child in catalog.relationships: + insert_statement = parent_child_association.insert().values( + parent_id=keys[parent], + child_id=keys[child] + ) + db.session.execute(insert_statement) + def get_member_by_path(data: List[Dict[str, Any]], path: List[int]) -> Optional[Dict[str, Any]]: diff --git a/requirements.txt b/requirements.txt index 980dd76..047482a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,4 +2,5 @@ saxonche==12.5.0 lxml flask flask-sqlalchemy -click \ No newline at end of file +click +uritemplate \ No newline at end of file From e5711dbf746238679998accb040e5ccc5e598117 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thibault=20Cl=C3=A9rice?= Date: Mon, 26 Aug 2024 08:02:12 +0200 Subject: [PATCH 11/16] Adding TQDM --- requirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 047482a..0a13a0c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,4 +3,5 @@ lxml flask flask-sqlalchemy click -uritemplate \ No newline at end of file +uritemplate +tqdm \ No newline at end of file From f35d203279a803879258aad5fc5ff329d982150b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thibault=20Cl=C3=A9rice?= Date: Mon, 26 Aug 2024 08:57:53 +0200 Subject: [PATCH 12/16] Unworking --- dapitains/app/app.py | 36 ++++++++++++++++++++++++++++++------ dapitains/app/ingest.py | 1 + 2 files changed, 31 insertions(+), 6 deletions(-) diff --git a/dapitains/app/app.py b/dapitains/app/app.py index 5a9c2d1..4832ef0 100644 --- a/dapitains/app/app.py +++ b/dapitains/app/app.py @@ -23,7 +23,17 @@ def msg_4xx(string, code=404) -> Response: return Response(json.dumps({"message": string}), status=code, mimetype="application/json") -def collection_view(identifier: Optional[str], nav: str, templates: Dict[str, str]) -> Response: +def collection_view( + identifier: Optional[str], + nav: str, + templates: Dict[str, uritemplate.URITemplate] +) -> Response: + """ Builds a collection view, regardless of how the parameters are received + + :param identifier: + :param nav: + :param templates: + """ if not identifier: coll: Collection = db.session.query(Collection).filter(~Collection.parents.any()).first() else: @@ -49,8 +59,22 @@ def collection_view(identifier: Optional[str], nav: str, templates: Dict[str, st **out, "totalParents": coll.total_parents, "totalChildren": coll.total_children, + "collection": templates["collection"].uri, "member": [ - related.json() + ( + related.json(inject=(**{ + "collection": templates["collection"].partial({"id": related.identifier}).uri, + "document": templates["collection"].partial({"id": related.identifier}).uri, + }, **( + { + "navigation": templates["collection"].partial( + {"id": related.identifier}).uri, + } if hasattr(coll, "citeStructure") else {})) + if related.resource + else related.json({ + "collection": templates["collection"].partial({"id": related.identifier}).uri + }) + ) for related in related_collections ] }), mimetype="application/json", status=200) @@ -108,12 +132,12 @@ def create_app( @app.route("/collection/") def collection_route(): resource = request.args.get("id") - nav = request.args.get("nav") + nav = request.args.get("nav", "children") return collection_view(resource, nav, templates={ - "navigation": navigation_template.partial({"resource": resource}).uri, - "collection": collection_template.partial({"id": resource}).uri, - "document": document_template.partial({"resource": resource}).uri, + "navigation": navigation_template, + "collection": collection_template, + "document": document_template, }) @app.route("/navigation/") diff --git a/dapitains/app/ingest.py b/dapitains/app/ingest.py index 721a845..5372c54 100644 --- a/dapitains/app/ingest.py +++ b/dapitains/app/ingest.py @@ -30,6 +30,7 @@ def store_catalog(catalog: Catalog): child_id=keys[child] ) db.session.execute(insert_statement) + db.session.commit() From 9607ebdcc1827a11d4b5aec4a412f2394ae325b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thibault=20Cl=C3=A9rice?= Date: Mon, 26 Aug 2024 09:12:42 +0200 Subject: [PATCH 13/16] Working citeStructure but None citeStructure are broken --- dapitains/app/app.py | 35 +++++++++++++++++----------------- dapitains/app/database.py | 5 ++++- dapitains/app/ingest.py | 21 ++++++++++++-------- dapitains/tei/citeStructure.py | 11 +++++++++++ 4 files changed, 46 insertions(+), 26 deletions(-) diff --git a/dapitains/app/app.py b/dapitains/app/app.py index 4832ef0..c039e64 100644 --- a/dapitains/app/app.py +++ b/dapitains/app/app.py @@ -61,22 +61,23 @@ def collection_view( "totalChildren": coll.total_children, "collection": templates["collection"].uri, "member": [ - ( - related.json(inject=(**{ - "collection": templates["collection"].partial({"id": related.identifier}).uri, - "document": templates["collection"].partial({"id": related.identifier}).uri, - }, **( - { - "navigation": templates["collection"].partial( - {"id": related.identifier}).uri, - } if hasattr(coll, "citeStructure") else {})) - if related.resource - else related.json({ - "collection": templates["collection"].partial({"id": related.identifier}).uri - }) - ) - for related in related_collections - ] + related.json( + inject=dict( + **{ + "collection": templates["collection"].partial({"id": related.identifier}).uri, + "document": templates["document"].partial({"resource": related.identifier}).uri, + }, + **( + { + "navigation": templates["navigation"].partial({"resource": related.identifier}).uri, + } if coll.citeStructure else {} + ) + ) if related.resource else related.json({ + "collection": templates["collection"].partial({"id": related.identifier}).uri + }) + ) + for related in related_collections + ] }), mimetype="application/json", status=200) @@ -126,7 +127,7 @@ def create_app( Initialisation of the DB is up to you """ navigation_template = uritemplate.URITemplate("/navigation/{?resource}{&ref,start,end,tree,down}") - collection_template = uritemplate.URITemplate("/navigation/collection/{?id,nav}") + collection_template = uritemplate.URITemplate("/collection/collection/{?id,nav}") document_template = uritemplate.URITemplate("/document/{?resource}{&ref,start,end,tree}") @app.route("/collection/") diff --git a/dapitains/app/database.py b/dapitains/app/database.py index 9094e35..954c8c6 100644 --- a/dapitains/app/database.py +++ b/dapitains/app/database.py @@ -58,6 +58,7 @@ class Collection(db.Model): filepath = db.Column(db.String, nullable=True) dublin_core = db.Column(JSONEncoded, nullable=True) extensions = db.Column(JSONEncoded, nullable=True) + citeStructure = db.Column(JSONEncoded, nullable=True) # One-to-one relationship with Navigation navigation = db.relationship('Navigation', uselist=False, backref='collection', lazy=True) @@ -91,7 +92,9 @@ def json(self, inject: Optional[Dict[str, Any]] = None): } if self.description: data["description"] = self.description - if self.dublin_core: + if self.citeStructure: + data["citeStructure"] = self.citeStructure + if self.dublin_core: # ToDo: Fix the way it's presented to adapt to dts view data["dublinCore"] = self.dublin_core if self.extensions: data["extensions"] = self.extensions diff --git a/dapitains/app/ingest.py b/dapitains/app/ingest.py index 5372c54..04cce1e 100644 --- a/dapitains/app/ingest.py +++ b/dapitains/app/ingest.py @@ -15,13 +15,18 @@ def store_catalog(catalog: Catalog): keys[coll_db.identifier] = coll_db.id if collection.resource: doc = Document(collection.filepath) - references = { - tree: [ref.json() for ref in obj.find_refs(doc.xml, structure=obj.units)] - for tree, obj in doc.citeStructure.items() - } - paths = {key: generate_paths(tree) for key, tree in references.items()} - nav = Navigation(collection_id=coll_db.id, paths=paths, references=references) - db.session.add(nav) + if doc.citeStructure: + references = { + tree: [ref.json() for ref in obj.find_refs(doc.xml, structure=obj.units)] + for tree, obj in doc.citeStructure.items() + } + paths = {key: generate_paths(tree) for key, tree in references.items()} + nav = Navigation(collection_id=coll_db.id, paths=paths, references=references) + db.session.add(nav) + coll_db.citeStructure = { + key: value.units.json() + for key, value in doc.citeStructure.items() + } db.session.commit() for parent, child in catalog.relationships: @@ -169,4 +174,4 @@ def get_nav( catalog, _ = ingest_catalog("/home/thibault/dev/MyDapytains/tests/catalog/example-collection.xml") - store_catalog(catalog) \ No newline at end of file + store_catalog(catalog) diff --git a/dapitains/tei/citeStructure.py b/dapitains/tei/citeStructure.py index bdf2338..53bc9e5 100644 --- a/dapitains/tei/citeStructure.py +++ b/dapitains/tei/citeStructure.py @@ -38,6 +38,17 @@ def get(self, ref: str): return f"{self.match}[{self.use}='{ref}']" return f"{self.match}[{self.use}={ref}]" + def json(self): + out = { + "citeType": self.citeType, + } + if self.children: + out["citeStructure"] = [ + child.json() + for child in self.children + ] + return out + @dataclass class CitableUnit: From b55093838bff943a3c19ddef9fd05dd768e3578f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thibault=20Cl=C3=A9rice?= Date: Mon, 26 Aug 2024 09:28:24 +0200 Subject: [PATCH 14/16] Adding get_refs and default_tree --- dapitains/app/ingest.py | 4 ++-- dapitains/tei/citeStructure.py | 2 +- dapitains/tei/document.py | 15 ++++++++++++--- tests/test_citeStructure.py | 6 +++--- tests/test_db_create.py | 27 ++++++++++++--------------- 5 files changed, 30 insertions(+), 24 deletions(-) diff --git a/dapitains/app/ingest.py b/dapitains/app/ingest.py index 04cce1e..2c5942c 100644 --- a/dapitains/app/ingest.py +++ b/dapitains/app/ingest.py @@ -17,14 +17,14 @@ def store_catalog(catalog: Catalog): doc = Document(collection.filepath) if doc.citeStructure: references = { - tree: [ref.json() for ref in obj.find_refs(doc.xml, structure=obj.units)] + tree: [ref.json() for ref in obj.find_refs(doc.xml, structure=obj.structure)] for tree, obj in doc.citeStructure.items() } paths = {key: generate_paths(tree) for key, tree in references.items()} nav = Navigation(collection_id=coll_db.id, paths=paths, references=references) db.session.add(nav) coll_db.citeStructure = { - key: value.units.json() + key: value.structure.json() for key, value in doc.citeStructure.items() } db.session.commit() diff --git a/dapitains/tei/citeStructure.py b/dapitains/tei/citeStructure.py index 53bc9e5..c82a964 100644 --- a/dapitains/tei/citeStructure.py +++ b/dapitains/tei/citeStructure.py @@ -102,7 +102,7 @@ def __init__(self, root: saxonlib.PyXdmNode): self.regex_pattern, cite_structure = self.build_regex_and_xpath( get_xpath_proc(self.root).evaluate_single("./citeStructure[1]") ) - self.units: CitableStructure = cite_structure + self.structure: CitableStructure = cite_structure def build_regex_and_xpath( self, diff --git a/dapitains/tei/document.py b/dapitains/tei/document.py index 6ed7299..8befbf0 100644 --- a/dapitains/tei/document.py +++ b/dapitains/tei/document.py @@ -231,13 +231,17 @@ def __init__(self, file_path: str): self.xml = PROCESSOR.parse_xml(xml_file_name=file_path) self.xpath_processor = get_xpath_proc(elem=self.xml) self.citeStructure: Dict[Optional[str], CiteStructureParser] = {} + + default = None for refsDecl in self.xpath_processor.evaluate("/TEI/teiHeader/refsDecl[./citeStructure]"): struct = CiteStructureParser(refsDecl) - self.citeStructure[refsDecl.get_attribute_value("n")] = struct + self.citeStructure[refsDecl.get_attribute_value("n") or "default"] = struct + + if refsDecl.get_attribute_value("default") == "true" or default is None: + default = refsDecl.get_attribute_value("n") or "default" - if refsDecl.get_attribute_value("default") == "true": - self.citeStructure[None] = struct + self.default_tree: str = default def get_passage(self, ref_or_start: Optional[str], end: Optional[str] = None, tree: Optional[str] = None) -> Element: """ Retrieve a given passage from the document @@ -255,6 +259,7 @@ def get_passage(self, ref_or_start: Optional[str], end: Optional[str] = None, tr else: raise ValueError("Start/End or Ref are necessary to get a passage") + tree = tree or self.default_tree try: start = self.citeStructure[tree].generate_xpath(start) except KeyError: @@ -278,3 +283,7 @@ def xpath_split(string: str) -> List[str]: ) objectify.deannotate(root, cleanup_namespaces=True) return root + + def get_reffs(self, tree: Optional[str] = None): + tree = self.citeStructure[tree or self.default_tree] + return tree.find_refs(root=self.xml, structure=tree.structure) diff --git a/tests/test_citeStructure.py b/tests/test_citeStructure.py index 0dca40d..e3ae623 100644 --- a/tests/test_citeStructure.py +++ b/tests/test_citeStructure.py @@ -58,7 +58,7 @@ def test_parsing(): # Generate XPath for "Luke 1" (partial match) assert parser.generate_xpath("Luke") == "//body/div[@n='Luke']" - assert [root.json() for root in parser.find_refs(root=TEI, structure=parser.units)] == [ + assert [root.json() for root in parser.find_refs(root=TEI, structure=parser.structure)] == [ {'citeType': 'book', 'ref': 'Luke', 'parent': None, 'level': 1, 'members': [ {'citeType': 'chapter', 'ref': 'Luke 1', 'parent': 'Luke', 'level': 2, 'members': [ {'citeType': 'verse', 'ref': 'Luke 1:1', 'parent': 'Luke 1', 'level': 3}, @@ -82,7 +82,7 @@ def test_cite_data(): xpath = get_xpath_proc(elem=TEI) citeStructure = xpath.evaluate_single("/TEI/teiHeader/refsDecl[1]") parser = CiteStructureParser(citeStructure) - refs = parser.find_refs(root=TEI, structure=parser.units) + refs = parser.find_refs(root=TEI, structure=parser.structure) refs = [ref.json() for ref in refs] assert refs == [ {'citeType': 'book', 'ref': '1', 'parent': None, 'level': 1, 'dublinCore': { @@ -106,7 +106,7 @@ def test_advanced_cite_data(): xpath = get_xpath_proc(elem=TEI) citeStructure = xpath.evaluate_single("/TEI/teiHeader/refsDecl[1]") parser = CiteStructureParser(citeStructure) - refs = parser.find_refs(root=TEI, structure=parser.units) + refs = parser.find_refs(root=TEI, structure=parser.structure) refs = [ref.json() for ref in refs] assert refs == [ {'citeType': 'part', 'ref': 'part-1', 'parent': None, 'level': 1, 'members': [ diff --git a/tests/test_db_create.py b/tests/test_db_create.py index edf4080..905d239 100644 --- a/tests/test_db_create.py +++ b/tests/test_db_create.py @@ -11,7 +11,7 @@ def test_simple_path(): """Check that a document can be parsed and that path are corrects""" doc = Document(f"{local_dir}/tei/multiple_tree.xml") refs = { - tree: [ref.json() for ref in obj.find_refs(doc.xml, structure=obj.units)] + tree: [ref.json() for ref in doc.get_reffs(tree)] for tree, obj in doc.citeStructure.items() } paths = {tree: generate_paths(ref) for tree, ref in refs.items()} @@ -19,9 +19,6 @@ def test_simple_path(): 'nums': { 'I': [0], '1': [1], 'A': [2], '4': [3], 'V': [4] }, - None: { - 'I': [0], '1': [1], 'A': [2], '4': [3], 'V': [4] - }, 'alpha': { 'div-a1': [0], 'div-002': [1], 'div-xyz': [2], 'div-004': [3], 'div-v5': [4] } @@ -29,12 +26,12 @@ def test_simple_path(): # Second part of the test doc = Document(f"{local_dir}/tei/base_tei.xml") refs = { - tree: [ref.json() for ref in obj.find_refs(doc.xml, structure=obj.units)] + tree: [ref.json() for ref in doc.get_reffs(tree)] for tree, obj in doc.citeStructure.items() } paths = {tree: generate_paths(ref) for tree, ref in refs.items()} assert paths == { - None: { + "default": { "Luke": [0], "Luke 1": [0, 0], "Luke 1:1": [0, 0, 0], @@ -49,27 +46,27 @@ def test_simple_path(): } } assert strip_members( - get_member_by_path(refs[None], paths[None]["Luke"]) + get_member_by_path(refs[doc.default_tree], paths[doc.default_tree]["Luke"]) ) == {'citeType': 'book', 'ref': 'Luke', "level": 1, "parent": None}, "Check that members are stripped" assert get_member_by_path( - refs[None], paths[None]["Mark 1:3"] + refs[doc.default_tree], paths[doc.default_tree]["Mark 1:3"] ) == {'citeType': 'verse', 'ref': 'Mark 1:3', "level": 3, "parent": "Mark 1"} def test_navigation(): doc = Document(f"{local_dir}/tei/base_tei.xml") refs = { - tree: [ref.json() for ref in obj.find_refs(doc.xml, structure=obj.units)] + tree: [ref.json() for ref in obj.find_refs(doc.xml, structure=obj.structure)] for tree, obj in doc.citeStructure.items() } paths = {tree: generate_paths(ref) for tree, ref in refs.items()} - assert get_nav(refs[None], paths[None], start_or_ref=None, end=None, down=1) == ([ + assert get_nav(refs[doc.default_tree], paths[doc.default_tree], start_or_ref=None, end=None, down=1) == ([ {'citeType': 'book', 'ref': 'Luke', "level": 1, "parent": None}, {'citeType': 'book', 'ref': 'Mark', "level": 1, "parent": None} ], None, None), "Check that base function works" - assert get_nav(refs[None], paths[None], start_or_ref="Luke 1:1", end="Luke 1#1", down=0) == ( + assert get_nav(refs[doc.default_tree], paths[doc.default_tree], start_or_ref="Luke 1:1", end="Luke 1#1", down=0) == ( [ {'citeType': 'verse', 'ref': 'Luke 1:1', "level": 3, "parent": "Luke 1"}, {'citeType': 'verse', 'ref': 'Luke 1:2', "level": 3, "parent": "Luke 1"}, @@ -79,7 +76,7 @@ def test_navigation(): {'citeType': 'bloup', 'ref': 'Luke 1#1', "level": 3, "parent": "Luke 1"} ), "Check that ?start/end works" - assert get_nav(refs[None], paths[None], start_or_ref="Luke 1:1", end="Mark 1:2", down=0) == ( + assert get_nav(refs[doc.default_tree], paths[doc.default_tree], start_or_ref="Luke 1:1", end="Mark 1:2", down=0) == ( [ {'citeType': 'verse', 'ref': 'Luke 1:1', "level": 3, "parent": "Luke 1"}, {'citeType': 'verse', 'ref': 'Luke 1:2', "level": 3, "parent": "Luke 1"}, @@ -91,7 +88,7 @@ def test_navigation(): {'citeType': 'verse', 'ref': 'Mark 1:2', "level": 3, "parent": "Mark 1"} ), "Check that ?start/end works across parents" - assert get_nav(refs[None], paths[None], start_or_ref="Luke 1", down=1) == ( + assert get_nav(refs[doc.default_tree], paths[doc.default_tree], start_or_ref="Luke 1", down=1) == ( [ {'citeType': 'verse', 'ref': 'Luke 1:1', "level": 3, "parent": "Luke 1"}, {'citeType': 'verse', 'ref': 'Luke 1:2', "level": 3, "parent": "Luke 1"}, @@ -101,7 +98,7 @@ def test_navigation(): None ), "Check that ?ref works" - assert get_nav(refs[None], paths[None], start_or_ref="Luke", down=1) == ( + assert get_nav(refs[doc.default_tree], paths[doc.default_tree], start_or_ref="Luke", down=1) == ( [ {'citeType': 'chapter', 'ref': 'Luke 1', "level": 2, "parent": "Luke"}, ], @@ -109,7 +106,7 @@ def test_navigation(): None ), "Check that ?ref works" - assert get_nav(refs[None], paths[None], start_or_ref=None, end=None, down=2) == ( + assert get_nav(refs[doc.default_tree], paths[doc.default_tree], start_or_ref=None, end=None, down=2) == ( [ {'citeType': 'book', 'ref': 'Luke', "level": 1, "parent": None}, {'citeType': 'chapter', 'ref': 'Luke 1', "level": 2, "parent": "Luke"}, From 658b5653fdc6d90f5991b914b686044045536a31 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thibault=20Cl=C3=A9rice?= Date: Mon, 26 Aug 2024 09:36:58 +0200 Subject: [PATCH 15/16] Moving things around --- dapitains/app/app.py | 6 +- dapitains/app/ingest.py | 132 +++---------------------------- dapitains/app/navigation.py | 118 +++++++++++++++++++++++++++ dapitains/metadata/xml_parser.py | 19 ++--- tests/test_catalog.py | 4 +- tests/test_db_create.py | 2 +- 6 files changed, 144 insertions(+), 137 deletions(-) create mode 100644 dapitains/app/navigation.py diff --git a/dapitains/app/app.py b/dapitains/app/app.py index c039e64..4848b50 100644 --- a/dapitains/app/app.py +++ b/dapitains/app/app.py @@ -16,7 +16,7 @@ import json from dapitains.app.database import db, Collection, Navigation -from dapitains.app.ingest import get_nav +from dapitains.app.navigation import get_nav def msg_4xx(string, code=404) -> Response: @@ -162,7 +162,7 @@ def navigation_route(): if __name__ == "__main__": import os from dapitains.app.ingest import store_catalog - from dapitains.metadata.xml_parser import ingest_catalog + from dapitains.metadata.xml_parser import parse app = Flask(__name__) _, db = create_app(app) @@ -177,7 +177,7 @@ def navigation_route(): db.drop_all() db.create_all() - catalog, _ = ingest_catalog(f"{basedir}/../../tests/catalog/example-collection.xml") + catalog, _ = parse(f"{basedir}/../../tests/catalog/example-collection.xml") store_catalog(catalog) app.run() diff --git a/dapitains/app/ingest.py b/dapitains/app/ingest.py index 2c5942c..929593f 100644 --- a/dapitains/app/ingest.py +++ b/dapitains/app/ingest.py @@ -1,13 +1,13 @@ -from typing import Dict, List, Optional, Any, Tuple +from typing import Dict, Optional from dapitains.app.database import Collection, Navigation, db, parent_child_association +from dapitains.app.navigation import generate_paths from dapitains.metadata.xml_parser import Catalog from dapitains.tei.document import Document -import copy import tqdm -def store_catalog(catalog: Catalog): - keys = {} +def store_single(catalog: Catalog, keys: Optional[Dict[str, int]]): + keys = keys or {} for identifier, collection in tqdm.tqdm(catalog.objects.items(), desc="Parsing all collections"): coll_db = Collection.from_class(collection) db.session.add(coll_db) @@ -38,128 +38,16 @@ def store_catalog(catalog: Catalog): db.session.commit() - -def get_member_by_path(data: List[Dict[str, Any]], path: List[int]) -> Optional[Dict[str, Any]]: - """ - Retrieve the member at the specified path in the nested data structure. - - :param data: The nested data structure (list of dictionaries). - :param path: A list of indices that represent the path to the desired member. - :return: The member at the specified path, or None if the path is invalid. - """ - current_level = data - - path_copy = [] + path - while path_copy: - index = path_copy.pop(0) - try: - current_level = current_level[index] - if 'members' in current_level and path_copy: - current_level = current_level['members'] - except (IndexError, KeyError): - return None - - return current_level - - -def strip_members(obj: Dict[str, Any]) -> Dict[str, Any]: - return {k: v for k, v in obj.items() if k != "members"} - - -def generate_paths(data: List[Dict[str, Any]], path: Optional[List[int]] = None) -> Dict[str, List[int]]: - """ - Generate a dictionary mapping each 'ref' in a nested data structure to its path. - - The path is represented as a list of indices that show how to access each 'ref' - in the nested structure. - - :param data: The nested data structure (list of dictionaries). Each dictionary - can have a 'ref' and/or 'members' key. - :param path: A list of indices representing the current path in the nested data - structure. Used internally for recursion. Defaults to None for the - initial call. - :return: A dictionary where each key is a 'ref' and each value is a list of indices - representing the path to that 'ref' in the nested structure. - """ - if path is None: - path = [] - - paths = {} - - def recurse(items, current_path): - for index, item in enumerate(items): - ref = item.get('ref') - if ref: - # Record the path for the current reference - paths[ref] = current_path + [index] - - members = item.get('members') - if members: - # Recurse into the 'members' list - recurse(members, current_path + [index]) - - recurse(data, []) - return paths - - -def get_nav( - refs: List[Dict[str, Any]], - paths: Dict[str, List[int]], - start_or_ref: Optional[str] = None, - end: Optional[str] = None, - down: Optional[int] = 1 -) -> Tuple[List[Dict[str, Any]], Optional[Dict[str, Any]], Optional[Dict[str, Any]]]: - """ Given a references set and a path set, provide the CitableUnit from start to end at down level. - - """ - - paths_index = list(paths.keys()) - start_index, end_index = None, None - if end: - end_index = paths_index.index(end) + 1 - if start_or_ref: - start_index = paths_index.index(start_or_ref) - if not end: - for index, reference in enumerate(paths_index[start_index+1:]): - if len(paths[start_or_ref]) == len(paths[reference]): - end_index = index + start_index + 1 - - paths = dict(list(paths.items())[start_index:end_index]) - - current_level = [0] - - start_path, end_path = None, None - - if start_or_ref: - start_path = paths[start_or_ref] - current_level.append(len(start_path)) - if end: - end_path = paths[end] - current_level.append(len(end_path)) - - current_level = max(current_level) - - if down == -1: - down = max(list(map(len, paths.values()))) - - if down == 0: - paths = {key: value for key, value in paths.items() if len(value) == current_level} - else: - paths = {key: value for key, value in paths.items() if current_level < len(value) <= down + current_level} - - return ( - [ - strip_members(get_member_by_path(refs, path)) for path in paths.values() - ], - strip_members(get_member_by_path(refs, start_path)) if start_path else None, - strip_members(get_member_by_path(refs, end_path)) if end_path else None - ) +def store_catalog(*catalogs): + keys = {} + for catalog in catalogs: + store_single(catalog, keys) if __name__ == "__main__": import flask import os - from dapitains.metadata.xml_parser import ingest_catalog + from dapitains.metadata.xml_parser import parse app = flask.Flask(__name__) basedir = os.path.abspath(os.path.dirname(__file__)) @@ -172,6 +60,6 @@ def get_nav( db.drop_all() db.create_all() - catalog, _ = ingest_catalog("/home/thibault/dev/MyDapytains/tests/catalog/example-collection.xml") + catalog, _ = parse("/home/thibault/dev/MyDapytains/tests/catalog/example-collection.xml") store_catalog(catalog) diff --git a/dapitains/app/navigation.py b/dapitains/app/navigation.py new file mode 100644 index 0000000..0914847 --- /dev/null +++ b/dapitains/app/navigation.py @@ -0,0 +1,118 @@ +from typing import List, Dict, Any, Optional, Tuple + + +def get_member_by_path(data: List[Dict[str, Any]], path: List[int]) -> Optional[Dict[str, Any]]: + """ + Retrieve the member at the specified path in the nested data structure. + + :param data: The nested data structure (list of dictionaries). + :param path: A list of indices that represent the path to the desired member. + :return: The member at the specified path, or None if the path is invalid. + """ + current_level = data + + path_copy = [] + path + while path_copy: + index = path_copy.pop(0) + try: + current_level = current_level[index] + if 'members' in current_level and path_copy: + current_level = current_level['members'] + except (IndexError, KeyError): + return None + + return current_level + + +def strip_members(obj: Dict[str, Any]) -> Dict[str, Any]: + return {k: v for k, v in obj.items() if k != "members"} + + +def generate_paths(data: List[Dict[str, Any]], path: Optional[List[int]] = None) -> Dict[str, List[int]]: + """ + Generate a dictionary mapping each 'ref' in a nested data structure to its path. + + The path is represented as a list of indices that show how to access each 'ref' + in the nested structure. + + :param data: The nested data structure (list of dictionaries). Each dictionary + can have a 'ref' and/or 'members' key. + :param path: A list of indices representing the current path in the nested data + structure. Used internally for recursion. Defaults to None for the + initial call. + :return: A dictionary where each key is a 'ref' and each value is a list of indices + representing the path to that 'ref' in the nested structure. + """ + if path is None: + path = [] + + paths = {} + + def recurse(items, current_path): + for index, item in enumerate(items): + ref = item.get('ref') + if ref: + # Record the path for the current reference + paths[ref] = current_path + [index] + + members = item.get('members') + if members: + # Recurse into the 'members' list + recurse(members, current_path + [index]) + + recurse(data, []) + return paths + + +def get_nav( + refs: List[Dict[str, Any]], + paths: Dict[str, List[int]], + start_or_ref: Optional[str] = None, + end: Optional[str] = None, + down: Optional[int] = 1 +) -> Tuple[List[Dict[str, Any]], Optional[Dict[str, Any]], Optional[Dict[str, Any]]]: + """ Given a references set and a path set, provide the CitableUnit from start to end at down level. + + """ + + paths_index = list(paths.keys()) + start_index, end_index = None, None + if end: + end_index = paths_index.index(end) + 1 + if start_or_ref: + start_index = paths_index.index(start_or_ref) + if not end: + for index, reference in enumerate(paths_index[start_index+1:]): + if len(paths[start_or_ref]) == len(paths[reference]): + end_index = index + start_index + 1 + + paths = dict(list(paths.items())[start_index:end_index]) + + current_level = [0] + + start_path, end_path = None, None + + if start_or_ref: + start_path = paths[start_or_ref] + current_level.append(len(start_path)) + if end: + end_path = paths[end] + current_level.append(len(end_path)) + + current_level = max(current_level) + + if down == -1: + down = max(list(map(len, paths.values()))) + + if down == 0: + paths = {key: value for key, value in paths.items() if len(value) == current_level} + else: + paths = {key: value for key, value in paths.items() if current_level < len(value) <= down + current_level} + + return ( + [ + strip_members(get_member_by_path(refs, path)) for path in paths.values() + ], + strip_members(get_member_by_path(refs, start_path)) if start_path else None, + strip_members(get_member_by_path(refs, end_path)) if end_path else None + ) diff --git a/dapitains/metadata/xml_parser.py b/dapitains/metadata/xml_parser.py index c33c8a4..fe6c5e8 100644 --- a/dapitains/metadata/xml_parser.py +++ b/dapitains/metadata/xml_parser.py @@ -6,6 +6,9 @@ from dapitains.metadata.classes import DublinCore, Extension, Collection +__all__ = ["Catalog", "parse"] + + _re_tag = re.compile(r"[{}]") @@ -15,7 +18,7 @@ class Catalog: objects: Dict[str, Collection] = field(default_factory=dict) -def parse_metadata(xml: ET.Element) -> Tuple[Dict[str, Any], List[str]]: +def _parse_metadata(xml: ET.Element) -> Tuple[Dict[str, Any], List[str]]: """ Parse Metadata :param xml: Collection/Resource tag @@ -54,14 +57,14 @@ def parse_metadata(xml: ET.Element) -> Tuple[Dict[str, Any], List[str]]: return obj, parents -def parse_collection(xml: ET.Element, basedir: str, tree: Catalog) -> Collection: +def _parse_collection(xml: ET.Element, basedir: str, tree: Catalog) -> Collection: """ Parse a Collection or Resource object :param xml: Parsed Collection or Resource by LXML :param basedir: Directory used to resolve filepath, that are relative to the main object :param tree: Catalog that is updated with objects. """ - obj, parents = parse_metadata(xml) + obj, parents = _parse_metadata(xml) obj = Collection(**obj, resource=xml.tag == "resource") for parent in parents: tree.relationships.append((parent, obj.identifier)) @@ -70,28 +73,26 @@ def parse_collection(xml: ET.Element, basedir: str, tree: Catalog) -> Collection obj.filepath = os.path.normpath(os.path.join(basedir, xml.attrib["filepath"])) for member in xml.xpath("./members/*"): if member.xpath("./title"): - child = parse_collection(member, basedir, tree) + child = _parse_collection(member, basedir, tree) tree.relationships.append((obj.identifier, child.identifier)) else: - _, child = ingest_catalog(os.path.join(basedir, member.attrib["filepath"]), tree) + _, child = parse(os.path.join(basedir, member.attrib["filepath"]), tree) tree.relationships.append((obj.identifier, child.identifier)) return obj -def ingest_catalog(path: str, tree: Optional[Catalog] = None) -> Tuple[Catalog, Collection]: +def parse(path: str, tree: Optional[Catalog] = None) -> Tuple[Catalog, Collection]: """ Ingest a collection description file. :param path: Path to a Collection XML File, see the schema at tests/catalog/schema.rng :param tree: Current catalog, which is either updated or created :return: Catalog and root collection found at path. - >>> ingest_catalog("../../tests/catalog/example-collection.xml") """ xml = ET.parse(path) current_dir = os.path.abspath(os.path.dirname(path)) root: ET.Element = xml.getroot() tree = tree or Catalog() - root_collection = parse_collection(root, basedir=current_dir, tree=tree) + root_collection = _parse_collection(root, basedir=current_dir, tree=tree) return tree, root_collection - diff --git a/tests/test_catalog.py b/tests/test_catalog.py index c376624..145ac77 100644 --- a/tests/test_catalog.py +++ b/tests/test_catalog.py @@ -1,6 +1,6 @@ import os.path -from dapitains.metadata.xml_parser import ingest_catalog +from dapitains.metadata.xml_parser import parse from dapitains.metadata.classes import * @@ -8,7 +8,7 @@ def test_ingestion(): - tree, _ = ingest_catalog(f"{local_dir}/catalog/example-collection.xml") + tree, _ = parse(f"{local_dir}/catalog/example-collection.xml") assert tree.objects == { "https://foo.bar/default": Collection( diff --git a/tests/test_db_create.py b/tests/test_db_create.py index 905d239..72984f1 100644 --- a/tests/test_db_create.py +++ b/tests/test_db_create.py @@ -1,5 +1,5 @@ import flask -from dapitains.app.ingest import generate_paths, get_member_by_path, get_nav, strip_members +from dapitains.app.navigation import get_member_by_path, strip_members, generate_paths, get_nav from dapitains.tei.document import Document import os From b02fc3c14fce815e71f3d7a85aac0d9a50c61010 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thibault=20Cl=C3=A9rice?= Date: Mon, 26 Aug 2024 09:54:26 +0200 Subject: [PATCH 16/16] citationTrees correctly implemented --- dapitains/app/database.py | 38 ++++++++++++++++++++++++++++------- dapitains/app/ingest.py | 23 ++------------------- dapitains/metadata/classes.py | 4 ++-- tests/test_catalog.py | 8 ++++---- 4 files changed, 39 insertions(+), 34 deletions(-) diff --git a/dapitains/app/database.py b/dapitains/app/database.py index 954c8c6..06a54a5 100644 --- a/dapitains/app/database.py +++ b/dapitains/app/database.py @@ -1,3 +1,5 @@ +from collections import defaultdict + try: from flask_sqlalchemy import SQLAlchemy from sqlalchemy.ext.mutable import MutableDict, Mutable @@ -10,8 +12,6 @@ from typing import Optional, Dict, Any import dapitains.metadata.classes as abstracts -from dapitains.metadata.xml_parser import Catalog -from dapitains.tei.document import Document import json @@ -59,6 +59,7 @@ class Collection(db.Model): dublin_core = db.Column(JSONEncoded, nullable=True) extensions = db.Column(JSONEncoded, nullable=True) citeStructure = db.Column(JSONEncoded, nullable=True) + default_tree = db.Column(db.String, nullable=True) # One-to-one relationship with Navigation navigation = db.relationship('Navigation', uselist=False, backref='collection', lazy=True) @@ -92,8 +93,16 @@ def json(self, inject: Optional[Dict[str, Any]] = None): } if self.description: data["description"] = self.description + if self.resource: + data["citationTrees"] = [] if self.citeStructure: - data["citeStructure"] = self.citeStructure + data["citationTrees"] = [self.citeStructure[self.default_tree]] + if len(self.citeStructure) >= 1: + data["citationTrees"][0]["identifier"] = self.default_tree + for key in self.citeStructure: + if key != self.default_tree: + data["citationTrees"].append(self.citeStructure[key]) + self.citeStructure[key]["identifier"] = key if self.dublin_core: # ToDo: Fix the way it's presented to adapt to dts view data["dublinCore"] = self.dublin_core if self.extensions: @@ -103,16 +112,32 @@ def json(self, inject: Optional[Dict[str, Any]] = None): @classmethod def from_class(cls, obj: abstracts.Collection) -> "Collection": - return cls( + dublin_core = defaultdict(list) + for dublin in obj.dublin_core: + if dublin.language: + dublin_core[dublin.term].append({"lang": dublin.language, "value": dublin.value}) + else: + dublin_core[dublin.term].append(dublin.value) + + extensions = defaultdict(list) + for exte in obj.extensions: + if exte.language: + extensions[exte.term].append({"lang": exte.language, "value": exte.value}) + else: + extensions[exte.term].append(exte.value) + + + obj = cls( identifier=obj.identifier, title=obj.title, description=obj.description, resource=obj.resource, filepath=obj.filepath, # We are dumping because it's not read or accessible - dublin_core=[dub.json() for dub in obj.dublin_core], - extensions=[ext.json() for ext in obj.extension] + dublin_core=dublin_core, #[dub.json() for dub in obj.dublin_core], + extensions=extensions, # [ext.json() for ext in obj.extension] ) + return obj class Navigation(db.Model): @@ -120,7 +145,6 @@ class Navigation(db.Model): id = db.Column(db.Integer, primary_key=True, autoincrement=True, nullable=False) collection_id = db.Column(db.Integer, db.ForeignKey('collections.id'), nullable=False, unique=True) - # default_tree = db.Column(db.String, nullable=True) # JSON fields stored as TEXT paths = db.Column(JSONEncoded, nullable=False, default={}) diff --git a/dapitains/app/ingest.py b/dapitains/app/ingest.py index 929593f..4c73cf1 100644 --- a/dapitains/app/ingest.py +++ b/dapitains/app/ingest.py @@ -27,6 +27,8 @@ def store_single(catalog: Catalog, keys: Optional[Dict[str, int]]): key: value.structure.json() for key, value in doc.citeStructure.items() } + coll_db.default_tree = doc.default_tree + db.session.add(coll_db) db.session.commit() for parent, child in catalog.relationships: @@ -42,24 +44,3 @@ def store_catalog(*catalogs): keys = {} for catalog in catalogs: store_single(catalog, keys) - - -if __name__ == "__main__": - import flask - import os - from dapitains.metadata.xml_parser import parse - app = flask.Flask(__name__) - - basedir = os.path.abspath(os.path.dirname(__file__)) - db_path = os.path.join(basedir, 'app.db') - app.config['SQLALCHEMY_DATABASE_URI'] = f'sqlite:///{db_path}' - app.config['SQLALCHEMY_TRACK_MODIFICATIONS'] = False - - db.init_app(app) - with app.app_context(): - db.drop_all() - db.create_all() - - catalog, _ = parse("/home/thibault/dev/MyDapytains/tests/catalog/example-collection.xml") - - store_catalog(catalog) diff --git a/dapitains/metadata/classes.py b/dapitains/metadata/classes.py index 09de602..90bec08 100644 --- a/dapitains/metadata/classes.py +++ b/dapitains/metadata/classes.py @@ -33,7 +33,7 @@ class Collection: title: str description: Optional[str] = None dublin_core: List[DublinCore] = field(default_factory=list) - extension: List[Extension] = field(default_factory=list) + extensions: List[Extension] = field(default_factory=list) resource: bool = False filepath: Optional[str] = None @@ -43,7 +43,7 @@ def json(self): "title": self.title, "description": self.description, "dublin_core": self.dublin_core, - "extension": self.extension, + "extension": self.extensions, "resource": self.resource, "filepath": self.filepath } diff --git a/tests/test_catalog.py b/tests/test_catalog.py index 145ac77..6602b2f 100644 --- a/tests/test_catalog.py +++ b/tests/test_catalog.py @@ -16,7 +16,7 @@ def test_ingestion(): title='A collection', description=None, dublin_core=[ DublinCore(term='abstract', value='This is a perfect example of an absract.', language=None), - DublinCore(term='abstract', value='Et je peux traduire en français', language='fr')], extension=[], + DublinCore(term='abstract', value='Et je peux traduire en français', language='fr')], extensions=[], resource=False, filepath=None ), @@ -29,7 +29,7 @@ def test_ingestion(): DublinCore(term='subject', value='History', language=None), DublinCore(term='date', value='2023-08-24', language=None) ], - extension=[], + extensions=[], resource=False, filepath=None ), @@ -41,7 +41,7 @@ def test_ingestion(): DublinCore(term='subject', value='World War II', language=None), DublinCore(term='language', value='en', language=None) ], - extension=[], resource=True, + extensions=[], resource=True, filepath=os.path.abspath(f"{local_dir}/tei/multiple_tree.xml") ), "https://foo.bar/text": Collection( @@ -51,7 +51,7 @@ def test_ingestion(): dublin_core=[ DublinCore(term='title', value='A simple resource', language=None) ], - extension=[], + extensions=[], resource=True, filepath=os.path.abspath(f"{local_dir}/tei/base_tei.xml") )