diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index f057f9a..64ef997 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -14,7 +14,7 @@ jobs: steps: - uses: actions/checkout@v4 - name: Set up Python - uses: actions/setup-python@v2 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - name: Display Python version diff --git a/.gitignore b/.gitignore index 7b6caf3..06fab00 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ +app.db # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] diff --git a/dapitains/app/app.py b/dapitains/app/app.py new file mode 100644 index 0000000..4848b50 --- /dev/null +++ b/dapitains/app/app.py @@ -0,0 +1,183 @@ +from typing import Dict, Any, Optional + +from sqlalchemy.orm.collections import collection + +from tests.test_db_create import test_navigation + +try: + import uritemplate + from flask import Flask, request, Response + from flask_sqlalchemy import SQLAlchemy + import click +except ImportError: + print("This part of the package can only be imported with the web requirements.") + raise + +import json + +from dapitains.app.database import db, Collection, Navigation +from dapitains.app.navigation import get_nav + + +def msg_4xx(string, code=404) -> Response: + return Response(json.dumps({"message": string}), status=code, mimetype="application/json") + + +def collection_view( + identifier: Optional[str], + nav: str, + templates: Dict[str, uritemplate.URITemplate] +) -> Response: + """ Builds a collection view, regardless of how the parameters are received + + :param identifier: + :param nav: + :param templates: + """ + if not identifier: + coll: Collection = db.session.query(Collection).filter(~Collection.parents.any()).first() + else: + coll = Collection.query.where(Collection.identifier==identifier).first() + if coll is None: + return msg_4xx("Unknown collection") + out = coll.json() + + if nav == 'children': + related_collections = 
db.session.query(Collection).filter( + Collection.parents.any(id=coll.id) + ).all() + elif nav == 'parents': + related_collections = db.session.query(Collection).filter( + Collection.children.any(id=coll.id) + ).all() + else: + return msg_4xx(f"nav parameter has a wrong value {nav}", code=400) + + return Response(json.dumps({ + "@context": "https://distributed-text-services.github.io/specifications/context/1-alpha1.json", + "dtsVersion": "1-alpha", + **out, + "totalParents": coll.total_parents, + "totalChildren": coll.total_children, + "collection": templates["collection"].uri, + "member": [ + related.json( + inject=dict( + **{ + "collection": templates["collection"].partial({"id": related.identifier}).uri, + "document": templates["document"].partial({"resource": related.identifier}).uri, + }, + **( + { + "navigation": templates["navigation"].partial({"resource": related.identifier}).uri, + } if coll.citeStructure else {} + ) + ) if related.resource else related.json({ + "collection": templates["collection"].partial({"id": related.identifier}).uri + }) + ) + for related in related_collections + ] + }), mimetype="application/json", status=200) + + +def navigation_view(resource, ref, start, end, tree, down, templates: Dict[str, str]) -> Response: + if not resource: + return msg_4xx("Resource parameter was not provided") + + collection: Collection = Collection.query.where(Collection.identifier == resource).first() + if not collection: + return msg_4xx(f"Unknown resource `{resource}`") + + nav: Navigation = Navigation.query.where(Navigation.collection_id == collection.id).first() + if nav is None: + return msg_4xx(f"The resource `{resource}` does not support navigation") + + # Check for forbidden combinations + if ref or start or end: + if tree not in nav.references: + return msg_4xx(f"Unknown tree {tree} for resource `{resource}`") + elif ref and (start or end): + return msg_4xx(f"You cannot provide a ref parameter as well as start or end", code=400) + elif not ref 
and ((start and not end) or (end and not start)): + return msg_4xx(f"Range is missing one of its parameters (start or end)", code=400) + else: + if down is None: + return msg_4xx(f"The down query parameter is required when requesting without ref or start/end", code=400) + + refs = nav.references[tree] + paths = nav.paths[tree] + members, start, end = get_nav(refs=refs, paths=paths, start_or_ref=start or ref, end=end, down=down) + return Response(json.dumps({ + "@context": "https://distributed-text-services.github.io/specifications/context/1-alpha1.json", + "dtsVersion": "1-alpha", + "@type": "Navigation", + "@id": "https://example.org/api/dts/navigation/?resource=https://en.wikisource.org/wiki/Dracula&down=1", + "resource": collection.json(inject=templates), # To Do: implement and inject URI templates + "member": members + }), mimetype="application/json", status=200) + + +def create_app( + app: Flask, + use_query: bool = False +) -> (Flask, SQLAlchemy): + """ + + Initialisation of the DB is up to you + """ + navigation_template = uritemplate.URITemplate("/navigation/{?resource}{&ref,start,end,tree,down}") + collection_template = uritemplate.URITemplate("/collection/collection/{?id,nav}") + document_template = uritemplate.URITemplate("/document/{?resource}{&ref,start,end,tree}") + + @app.route("/collection/") + def collection_route(): + resource = request.args.get("id") + nav = request.args.get("nav", "children") + + return collection_view(resource, nav, templates={ + "navigation": navigation_template, + "collection": collection_template, + "document": document_template, + }) + + @app.route("/navigation/") + def navigation_route(): + resource = request.args.get("resource") + ref = request.args.get("ref") + start = request.args.get("start") + end = request.args.get("end") + tree = request.args.get("tree") + down = request.args.get("down", type=int, default=None) + + return navigation_view(resource, ref, start, end, tree, down, templates={ + "navigation": 
navigation_template.partial({"resource": resource}).uri, + "collection": collection_template.partial({"id": resource}).uri, + "document": document_template.partial({"resource": resource}).uri, + }) + + return app, db + + +if __name__ == "__main__": + import os + from dapitains.app.ingest import store_catalog + from dapitains.metadata.xml_parser import parse + + app = Flask(__name__) + _, db = create_app(app) + + basedir = os.path.abspath(os.path.dirname(__file__)) + db_path = os.path.join(basedir, 'app.db') + app.config['SQLALCHEMY_DATABASE_URI'] = f'sqlite:///{db_path}' + app.config['SQLALCHEMY_TRACK_MODIFICATIONS'] = False + + db.init_app(app) + with app.app_context(): + db.drop_all() + db.create_all() + + catalog, _ = parse(f"{basedir}/../../tests/catalog/example-collection.xml") + store_catalog(catalog) + + app.run() diff --git a/dapitains/app/database.py b/dapitains/app/database.py new file mode 100644 index 0000000..06a54a5 --- /dev/null +++ b/dapitains/app/database.py @@ -0,0 +1,151 @@ +from collections import defaultdict + +try: + from flask_sqlalchemy import SQLAlchemy + from sqlalchemy.ext.mutable import MutableDict, Mutable + from sqlalchemy.types import TypeDecorator, TEXT + from sqlalchemy import func + import click +except ImportError: + print("This part of the package can only be imported with the web requirements.") + raise + +from typing import Optional, Dict, Any +import dapitains.metadata.classes as abstracts +import json + + +class CustomKeyJSONDecoder(json.JSONDecoder): + def __init__(self, *args, **kwargs): + super().__init__(object_hook=self.object_hook, *args, **kwargs) + + def object_hook(self, obj): + # Only convert 'None' string keys back to None + return {None if k == 'null' else k: v for k, v in obj.items()} + +db = SQLAlchemy() + +parent_child_association = db.Table('parent_child_association', + db.Column('parent_id', db.Integer, db.ForeignKey('collections.id'), primary_key=True), + db.Column('child_id', db.Integer, 
db.ForeignKey('collections.id'), primary_key=True) +) + + +class JSONEncoded(TypeDecorator): + """Enables JSON storage by encoding and decoding on the fly.""" + impl = TEXT + + def process_bind_param(self, value, dialect): + if value is None: + return None + else: + return json.dumps(value) + + def process_result_value(self, value, dialect): + if value is None: + return None + return json.loads(value, cls=CustomKeyJSONDecoder) + + +class Collection(db.Model): + __tablename__ = 'collections' + + id = db.Column(db.Integer, primary_key=True, autoincrement=True, nullable=False) + identifier = db.Column(db.String, nullable=False, unique=True) + title = db.Column(db.String, nullable=False) + description = db.Column(db.String, nullable=True) + resource = db.Column(db.Boolean, default=False) + filepath = db.Column(db.String, nullable=True) + dublin_core = db.Column(JSONEncoded, nullable=True) + extensions = db.Column(JSONEncoded, nullable=True) + citeStructure = db.Column(JSONEncoded, nullable=True) + default_tree = db.Column(db.String, nullable=True) + + # One-to-one relationship with Navigation + navigation = db.relationship('Navigation', uselist=False, backref='collection', lazy=True) + + parents = db.relationship( + 'Collection', + secondary=parent_child_association, + primaryjoin=id == parent_child_association.c.child_id, + secondaryjoin=id == parent_child_association.c.parent_id, + backref='children' + ) + + @property + def total_children(self): + return db.session.query(func.count(parent_child_association.c.child_id)).filter( + parent_child_association.c.parent_id == self.id + ).scalar() + + @property + def total_parents(self): + return db.session.query(func.count(parent_child_association.c.parent_id)).filter( + parent_child_association.c.child_id == self.id + ).scalar() + + def json(self, inject: Optional[Dict[str, Any]] = None): + data = { + "@type": "Resource" if self.resource else "Collection", + "@id": self.identifier, + "title": self.title, + **(inject or {}) 
+ } + if self.description: + data["description"] = self.description + if self.resource: + data["citationTrees"] = [] + if self.citeStructure: + data["citationTrees"] = [self.citeStructure[self.default_tree]] + if len(self.citeStructure) >= 1: + data["citationTrees"][0]["identifier"] = self.default_tree + for key in self.citeStructure: + if key != self.default_tree: + data["citationTrees"].append(self.citeStructure[key]) + self.citeStructure[key]["identifier"] = key + if self.dublin_core: # ToDo: Fix the way it's presented to adapt to dts view + data["dublinCore"] = self.dublin_core + if self.extensions: + data["extensions"] = self.extensions + + return data + + @classmethod + def from_class(cls, obj: abstracts.Collection) -> "Collection": + dublin_core = defaultdict(list) + for dublin in obj.dublin_core: + if dublin.language: + dublin_core[dublin.term].append({"lang": dublin.language, "value": dublin.value}) + else: + dublin_core[dublin.term].append(dublin.value) + + extensions = defaultdict(list) + for exte in obj.extensions: + if exte.language: + extensions[exte.term].append({"lang": exte.language, "value": exte.value}) + else: + extensions[exte.term].append(exte.value) + + + obj = cls( + identifier=obj.identifier, + title=obj.title, + description=obj.description, + resource=obj.resource, + filepath=obj.filepath, + # We are dumping because it's not read or accessible + dublin_core=dublin_core, #[dub.json() for dub in obj.dublin_core], + extensions=extensions, # [ext.json() for ext in obj.extension] + ) + return obj + + +class Navigation(db.Model): + __tablename__ = 'navigations' + + id = db.Column(db.Integer, primary_key=True, autoincrement=True, nullable=False) + collection_id = db.Column(db.Integer, db.ForeignKey('collections.id'), nullable=False, unique=True) + + # JSON fields stored as TEXT + paths = db.Column(JSONEncoded, nullable=False, default={}) + references = db.Column(JSONEncoded, nullable=False, default={}) diff --git a/dapitains/app/ingest.py 
b/dapitains/app/ingest.py new file mode 100644 index 0000000..4c73cf1 --- /dev/null +++ b/dapitains/app/ingest.py @@ -0,0 +1,46 @@ +from typing import Dict, Optional +from dapitains.app.database import Collection, Navigation, db, parent_child_association +from dapitains.app.navigation import generate_paths +from dapitains.metadata.xml_parser import Catalog +from dapitains.tei.document import Document +import tqdm + + +def store_single(catalog: Catalog, keys: Optional[Dict[str, int]]): + keys = keys or {} + for identifier, collection in tqdm.tqdm(catalog.objects.items(), desc="Parsing all collections"): + coll_db = Collection.from_class(collection) + db.session.add(coll_db) + db.session.flush() + keys[coll_db.identifier] = coll_db.id + if collection.resource: + doc = Document(collection.filepath) + if doc.citeStructure: + references = { + tree: [ref.json() for ref in obj.find_refs(doc.xml, structure=obj.structure)] + for tree, obj in doc.citeStructure.items() + } + paths = {key: generate_paths(tree) for key, tree in references.items()} + nav = Navigation(collection_id=coll_db.id, paths=paths, references=references) + db.session.add(nav) + coll_db.citeStructure = { + key: value.structure.json() + for key, value in doc.citeStructure.items() + } + coll_db.default_tree = doc.default_tree + db.session.add(coll_db) + db.session.commit() + + for parent, child in catalog.relationships: + insert_statement = parent_child_association.insert().values( + parent_id=keys[parent], + child_id=keys[child] + ) + db.session.execute(insert_statement) + db.session.commit() + + +def store_catalog(*catalogs): + keys = {} + for catalog in catalogs: + store_single(catalog, keys) diff --git a/dapitains/app/navigation.py b/dapitains/app/navigation.py new file mode 100644 index 0000000..0914847 --- /dev/null +++ b/dapitains/app/navigation.py @@ -0,0 +1,118 @@ +from typing import List, Dict, Any, Optional, Tuple + + +def get_member_by_path(data: List[Dict[str, Any]], path: List[int]) -> 
Optional[Dict[str, Any]]: + """ + Retrieve the member at the specified path in the nested data structure. + + :param data: The nested data structure (list of dictionaries). + :param path: A list of indices that represent the path to the desired member. + :return: The member at the specified path, or None if the path is invalid. + """ + current_level = data + + path_copy = [] + path + while path_copy: + index = path_copy.pop(0) + try: + current_level = current_level[index] + if 'members' in current_level and path_copy: + current_level = current_level['members'] + except (IndexError, KeyError): + return None + + return current_level + + +def strip_members(obj: Dict[str, Any]) -> Dict[str, Any]: + return {k: v for k, v in obj.items() if k != "members"} + + +def generate_paths(data: List[Dict[str, Any]], path: Optional[List[int]] = None) -> Dict[str, List[int]]: + """ + Generate a dictionary mapping each 'ref' in a nested data structure to its path. + + The path is represented as a list of indices that show how to access each 'ref' + in the nested structure. + + :param data: The nested data structure (list of dictionaries). Each dictionary + can have a 'ref' and/or 'members' key. + :param path: A list of indices representing the current path in the nested data + structure. Used internally for recursion. Defaults to None for the + initial call. + :return: A dictionary where each key is a 'ref' and each value is a list of indices + representing the path to that 'ref' in the nested structure. 
+ """ + if path is None: + path = [] + + paths = {} + + def recurse(items, current_path): + for index, item in enumerate(items): + ref = item.get('ref') + if ref: + # Record the path for the current reference + paths[ref] = current_path + [index] + + members = item.get('members') + if members: + # Recurse into the 'members' list + recurse(members, current_path + [index]) + + recurse(data, []) + return paths + + +def get_nav( + refs: List[Dict[str, Any]], + paths: Dict[str, List[int]], + start_or_ref: Optional[str] = None, + end: Optional[str] = None, + down: Optional[int] = 1 +) -> Tuple[List[Dict[str, Any]], Optional[Dict[str, Any]], Optional[Dict[str, Any]]]: + """ Given a references set and a path set, provide the CitableUnit from start to end at down level. + + """ + + paths_index = list(paths.keys()) + start_index, end_index = None, None + if end: + end_index = paths_index.index(end) + 1 + if start_or_ref: + start_index = paths_index.index(start_or_ref) + if not end: + for index, reference in enumerate(paths_index[start_index+1:]): + if len(paths[start_or_ref]) == len(paths[reference]): + end_index = index + start_index + 1 + + paths = dict(list(paths.items())[start_index:end_index]) + + current_level = [0] + + start_path, end_path = None, None + + if start_or_ref: + start_path = paths[start_or_ref] + current_level.append(len(start_path)) + if end: + end_path = paths[end] + current_level.append(len(end_path)) + + current_level = max(current_level) + + if down == -1: + down = max(list(map(len, paths.values()))) + + if down == 0: + paths = {key: value for key, value in paths.items() if len(value) == current_level} + else: + paths = {key: value for key, value in paths.items() if current_level < len(value) <= down + current_level} + + return ( + [ + strip_members(get_member_by_path(refs, path)) for path in paths.values() + ], + strip_members(get_member_by_path(refs, start_path)) if start_path else None, + strip_members(get_member_by_path(refs, end_path)) if 
end_path else None + ) diff --git a/dapitains/metadata/classes.py b/dapitains/metadata/classes.py index a77f694..90bec08 100644 --- a/dapitains/metadata/classes.py +++ b/dapitains/metadata/classes.py @@ -9,7 +9,10 @@ class DublinCore: language: Optional[str] = None def json(self): - return {"property": f"http://purl.org/dc/terms/{self.term}", "value": self.value, "language": self.language} + if self.language: + return {"property": f"http://purl.org/dc/terms/{self.term}", "value": self.value, "lang": self.language} + else: + return {"property": f"http://purl.org/dc/terms/{self.term}", "value": self.value} class Extension(DublinCore): @@ -18,7 +21,10 @@ class Extension(DublinCore): language: Optional[str] = None def json(self): - return {"property": self.term, "value": self.value, "language": self.language} + if self.language: + return {"property": self.term, "value": self.value, "language": self.language} + else: + return {"property": self.term, "value": self.value} @dataclass @@ -27,7 +33,7 @@ class Collection: title: str description: Optional[str] = None dublin_core: List[DublinCore] = field(default_factory=list) - extension: List[Extension] = field(default_factory=list) + extensions: List[Extension] = field(default_factory=list) resource: bool = False filepath: Optional[str] = None @@ -37,7 +43,7 @@ def json(self): "title": self.title, "description": self.description, "dublin_core": self.dublin_core, - "extension": self.extension, + "extension": self.extensions, "resource": self.resource, "filepath": self.filepath } diff --git a/dapitains/metadata/xml_parser.py b/dapitains/metadata/xml_parser.py index c33c8a4..fe6c5e8 100644 --- a/dapitains/metadata/xml_parser.py +++ b/dapitains/metadata/xml_parser.py @@ -6,6 +6,9 @@ from dapitains.metadata.classes import DublinCore, Extension, Collection +__all__ = ["Catalog", "parse"] + + _re_tag = re.compile(r"[{}]") @@ -15,7 +18,7 @@ class Catalog: objects: Dict[str, Collection] = field(default_factory=dict) -def 
parse_metadata(xml: ET.Element) -> Tuple[Dict[str, Any], List[str]]: +def _parse_metadata(xml: ET.Element) -> Tuple[Dict[str, Any], List[str]]: """ Parse Metadata :param xml: Collection/Resource tag @@ -54,14 +57,14 @@ def parse_metadata(xml: ET.Element) -> Tuple[Dict[str, Any], List[str]]: return obj, parents -def parse_collection(xml: ET.Element, basedir: str, tree: Catalog) -> Collection: +def _parse_collection(xml: ET.Element, basedir: str, tree: Catalog) -> Collection: """ Parse a Collection or Resource object :param xml: Parsed Collection or Resource by LXML :param basedir: Directory used to resolve filepath, that are relative to the main object :param tree: Catalog that is updated with objects. """ - obj, parents = parse_metadata(xml) + obj, parents = _parse_metadata(xml) obj = Collection(**obj, resource=xml.tag == "resource") for parent in parents: tree.relationships.append((parent, obj.identifier)) @@ -70,28 +73,26 @@ def parse_collection(xml: ET.Element, basedir: str, tree: Catalog) -> Collection obj.filepath = os.path.normpath(os.path.join(basedir, xml.attrib["filepath"])) for member in xml.xpath("./members/*"): if member.xpath("./title"): - child = parse_collection(member, basedir, tree) + child = _parse_collection(member, basedir, tree) tree.relationships.append((obj.identifier, child.identifier)) else: - _, child = ingest_catalog(os.path.join(basedir, member.attrib["filepath"]), tree) + _, child = parse(os.path.join(basedir, member.attrib["filepath"]), tree) tree.relationships.append((obj.identifier, child.identifier)) return obj -def ingest_catalog(path: str, tree: Optional[Catalog] = None) -> Tuple[Catalog, Collection]: +def parse(path: str, tree: Optional[Catalog] = None) -> Tuple[Catalog, Collection]: """ Ingest a collection description file. :param path: Path to a Collection XML File, see the schema at tests/catalog/schema.rng :param tree: Current catalog, which is either updated or created :return: Catalog and root collection found at path. 
- >>> ingest_catalog("../../tests/catalog/example-collection.xml") """ xml = ET.parse(path) current_dir = os.path.abspath(os.path.dirname(path)) root: ET.Element = xml.getroot() tree = tree or Catalog() - root_collection = parse_collection(root, basedir=current_dir, tree=tree) + root_collection = _parse_collection(root, basedir=current_dir, tree=tree) return tree, root_collection - diff --git a/dapitains/tei/citeStructure.py b/dapitains/tei/citeStructure.py index 79e866d..c82a964 100644 --- a/dapitains/tei/citeStructure.py +++ b/dapitains/tei/citeStructure.py @@ -38,6 +38,17 @@ def get(self, ref: str): return f"{self.match}[{self.use}='{ref}']" return f"{self.match}[{self.use}={ref}]" + def json(self): + out = { + "citeType": self.citeType, + } + if self.children: + out["citeStructure"] = [ + child.json() + for child in self.children + ] + return out + @dataclass class CitableUnit: @@ -47,15 +58,19 @@ class CitableUnit: node: Optional[saxonlib.PyXdmNode] = None dublinCore: Dict[str, List[str]] = field(default_factory=lambda: defaultdict(list)) extension: Dict[str, List[str]] = field(default_factory=lambda: defaultdict(list)) + level: int = 1 + parent: Optional[str] = None - def to_dts(self): + def json(self): out = { "citeType": self.citeType, - "ref": self.ref + "ref": self.ref, + "level": self.level, + "parent": self.parent } if self.children: out["members"] = [ - member.to_dts() + member.json() for member in self.children ] if self.dublinCore: @@ -87,7 +102,7 @@ def __init__(self, root: saxonlib.PyXdmNode): self.regex_pattern, cite_structure = self.build_regex_and_xpath( get_xpath_proc(self.root).evaluate_single("./citeStructure[1]") ) - self.units: CitableStructure = cite_structure + self.structure: CitableStructure = cite_structure def build_regex_and_xpath( self, @@ -189,26 +204,30 @@ def _dispatch( child_xpath: str, structure: CitableStructure, xpath_processor: saxonlib.PyXPathProcessor, - unit: CitableUnit): + unit: CitableUnit, + level: int): # target = 
self.generate_xpath(child.ref) if len(structure.children) == 1: self.find_refs( root=xpath_processor.evaluate_single(child_xpath), structure=structure.children[0], - unit=unit + unit=unit, + level=level ) else: self.find_refs_from_branches( root=xpath_processor.evaluate_single(child_xpath), structure=structure.children, - unit=unit + unit=unit, + level=level ) def find_refs( self, root: saxonlib.PyXdmNode, structure: CitableStructure = None, - unit: Optional[CitableUnit] = None + unit: Optional[CitableUnit] = None, + level: int = 1 ) -> List[CitableUnit]: xpath_proc = get_xpath_proc(elem=root) prefix = (unit.ref + structure.delim) if unit else "" @@ -218,7 +237,9 @@ def find_refs( for value in xpath_proc.evaluate(f"{xpath_prefix}{structure.xpath}"): child = CitableUnit( citeType=structure.citeType, - ref=f"{prefix}{value.string_value}" + ref=f"{prefix}{value.string_value}", + parent=unit.ref if unit else None, + level=level ) if structure.metadata: @@ -238,7 +259,8 @@ def find_refs( child_xpath=self.generate_xpath(child.ref), structure=structure, xpath_processor=xpath_proc, - unit=child + unit=child, + level=level+1 ) return units @@ -246,7 +268,8 @@ def find_refs_from_branches( self, root: saxonlib.PyXdmNode, structure: List[CitableStructure], - unit: Optional[CitableUnit] = None + unit: Optional[CitableUnit] = None, + level: int = 1 ) -> List[CitableUnit]: xpath_proc = get_xpath_proc(elem=root) prefix = (unit.ref) if unit else "" # ToDo: Reinject delim @@ -281,7 +304,9 @@ def compare_nodes_by_doc_order(node1, node2): for elem in unsorted: child_unit = CitableUnit( citeType=elem.struct.citeType, - ref=elem.citation + ref=elem.citation, + level=level, + parent=unit.ref if unit else None ) if unit: @@ -294,7 +319,8 @@ def compare_nodes_by_doc_order(node1, node2): child_xpath=self.generate_xpath(child_unit.ref), structure=elem.struct, xpath_processor=xpath_proc, - unit=child_unit + unit=child_unit, + level=level+1 ) return units diff --git a/dapitains/tei/tei.py 
b/dapitains/tei/document.py similarity index 96% rename from dapitains/tei/tei.py rename to dapitains/tei/document.py index 6ed7299..8befbf0 100644 --- a/dapitains/tei/tei.py +++ b/dapitains/tei/document.py @@ -231,13 +231,17 @@ def __init__(self, file_path: str): self.xml = PROCESSOR.parse_xml(xml_file_name=file_path) self.xpath_processor = get_xpath_proc(elem=self.xml) self.citeStructure: Dict[Optional[str], CiteStructureParser] = {} + + default = None for refsDecl in self.xpath_processor.evaluate("/TEI/teiHeader/refsDecl[./citeStructure]"): struct = CiteStructureParser(refsDecl) - self.citeStructure[refsDecl.get_attribute_value("n")] = struct + self.citeStructure[refsDecl.get_attribute_value("n") or "default"] = struct + + if refsDecl.get_attribute_value("default") == "true" or default is None: + default = refsDecl.get_attribute_value("n") or "default" - if refsDecl.get_attribute_value("default") == "true": - self.citeStructure[None] = struct + self.default_tree: str = default def get_passage(self, ref_or_start: Optional[str], end: Optional[str] = None, tree: Optional[str] = None) -> Element: """ Retrieve a given passage from the document @@ -255,6 +259,7 @@ def get_passage(self, ref_or_start: Optional[str], end: Optional[str] = None, tr else: raise ValueError("Start/End or Ref are necessary to get a passage") + tree = tree or self.default_tree try: start = self.citeStructure[tree].generate_xpath(start) except KeyError: @@ -278,3 +283,7 @@ def xpath_split(string: str) -> List[str]: ) objectify.deannotate(root, cleanup_namespaces=True) return root + + def get_reffs(self, tree: Optional[str] = None): + tree = self.citeStructure[tree or self.default_tree] + return tree.find_refs(root=self.xml, structure=tree.structure) diff --git a/requirements.txt b/requirements.txt index 980dd76..0a13a0c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,4 +2,6 @@ saxonche==12.5.0 lxml flask flask-sqlalchemy -click \ No newline at end of file +click +uritemplate +tqdm \ 
No newline at end of file diff --git a/tests/test_catalog.py b/tests/test_catalog.py index c376624..6602b2f 100644 --- a/tests/test_catalog.py +++ b/tests/test_catalog.py @@ -1,6 +1,6 @@ import os.path -from dapitains.metadata.xml_parser import ingest_catalog +from dapitains.metadata.xml_parser import parse from dapitains.metadata.classes import * @@ -8,7 +8,7 @@ def test_ingestion(): - tree, _ = ingest_catalog(f"{local_dir}/catalog/example-collection.xml") + tree, _ = parse(f"{local_dir}/catalog/example-collection.xml") assert tree.objects == { "https://foo.bar/default": Collection( @@ -16,7 +16,7 @@ def test_ingestion(): title='A collection', description=None, dublin_core=[ DublinCore(term='abstract', value='This is a perfect example of an absract.', language=None), - DublinCore(term='abstract', value='Et je peux traduire en français', language='fr')], extension=[], + DublinCore(term='abstract', value='Et je peux traduire en français', language='fr')], extensions=[], resource=False, filepath=None ), @@ -29,7 +29,7 @@ def test_ingestion(): DublinCore(term='subject', value='History', language=None), DublinCore(term='date', value='2023-08-24', language=None) ], - extension=[], + extensions=[], resource=False, filepath=None ), @@ -41,7 +41,7 @@ def test_ingestion(): DublinCore(term='subject', value='World War II', language=None), DublinCore(term='language', value='en', language=None) ], - extension=[], resource=True, + extensions=[], resource=True, filepath=os.path.abspath(f"{local_dir}/tei/multiple_tree.xml") ), "https://foo.bar/text": Collection( @@ -51,7 +51,7 @@ def test_ingestion(): dublin_core=[ DublinCore(term='title', value='A simple resource', language=None) ], - extension=[], + extensions=[], resource=True, filepath=os.path.abspath(f"{local_dir}/tei/base_tei.xml") ) diff --git a/tests/test_citeStructure.py b/tests/test_citeStructure.py index 8154fd9..e3ae623 100644 --- a/tests/test_citeStructure.py +++ b/tests/test_citeStructure.py @@ -58,20 +58,21 @@ def 
test_parsing(): # Generate XPath for "Luke 1" (partial match) assert parser.generate_xpath("Luke") == "//body/div[@n='Luke']" - assert [root.to_dts() for root in parser.find_refs(root=TEI, structure=parser.units)] == [ - {'citeType': 'book', 'ref': 'Luke', 'members': [ - {'citeType': 'chapter', 'ref': 'Luke 1', 'members': [ - {'citeType': 'verse', 'ref': 'Luke 1:1'}, - {'citeType': 'verse', 'ref': 'Luke 1:2'}, - {'citeType': 'bloup', 'ref': 'Luke 1#1'} - ]} + assert [root.json() for root in parser.find_refs(root=TEI, structure=parser.structure)] == [ + {'citeType': 'book', 'ref': 'Luke', 'parent': None, 'level': 1, 'members': [ + {'citeType': 'chapter', 'ref': 'Luke 1', 'parent': 'Luke', 'level': 2, 'members': [ + {'citeType': 'verse', 'ref': 'Luke 1:1', 'parent': 'Luke 1', 'level': 3}, + {'citeType': 'verse', 'ref': 'Luke 1:2', 'parent': 'Luke 1', 'level': 3}, + {'citeType': 'bloup', 'ref': 'Luke 1#1', 'parent': 'Luke 1', 'level': 3} + ] + } ]}, - {'citeType': 'book', 'ref': 'Mark', 'members': [ - {'citeType': 'chapter', 'ref': 'Mark 1', 'members': [ - {'citeType': 'verse', 'ref': 'Mark 1:1'}, - {'citeType': 'verse', 'ref': 'Mark 1:2'}, - {'citeType': 'bloup', 'ref': 'Mark 1#1'}, - {'citeType': 'verse', 'ref': 'Mark 1:3'} + {'citeType': 'book', 'ref': 'Mark', 'parent': None, 'level': 1, 'members': [ + {'citeType': 'chapter', 'ref': 'Mark 1', 'parent': 'Mark', 'level': 2, 'members': [ + {'citeType': 'verse', 'ref': 'Mark 1:1', 'parent': 'Mark 1', 'level': 3}, + {'citeType': 'verse', 'ref': 'Mark 1:2', 'parent': 'Mark 1', 'level': 3}, + {'citeType': 'bloup', 'ref': 'Mark 1#1', 'parent': 'Mark 1', 'level': 3}, + {'citeType': 'verse', 'ref': 'Mark 1:3', 'parent': 'Mark 1', 'level': 3} ]} ]} ] @@ -81,20 +82,20 @@ def test_cite_data(): xpath = get_xpath_proc(elem=TEI) citeStructure = xpath.evaluate_single("/TEI/teiHeader/refsDecl[1]") parser = CiteStructureParser(citeStructure) - refs = parser.find_refs(root=TEI, structure=parser.units) - refs = [ref.to_dts() for ref in 
refs] + refs = parser.find_refs(root=TEI, structure=parser.structure) + refs = [ref.json() for ref in refs] assert refs == [ - {'citeType': 'book', 'ref': '1', 'dublinCore': { + {'citeType': 'book', 'ref': '1', 'parent': None, 'level': 1, 'dublinCore': { 'http://purl.org/dc/terms/title': ['Introduction', 'Introduction'], 'http://purl.org/dc/terms/creator': ['John Doe']}}, - {'citeType': 'book', 'ref': '2', 'dublinCore': {'http://purl.org/dc/terms/title': ["Background", 'Contexte']}}, - {'citeType': 'book', 'ref': '3', 'dublinCore': { + {'citeType': 'book', 'ref': '2', 'parent': None, 'level': 1, 'dublinCore': {'http://purl.org/dc/terms/title': ["Background", 'Contexte']}}, + {'citeType': 'book', 'ref': '3', 'parent': None, 'level': 1, 'dublinCore': { 'http://purl.org/dc/terms/title': ['Methodology', 'Méthodologie'], 'http://purl.org/dc/terms/creator': ['Albert Einstein']}}, - {'citeType': 'book', 'ref': '4', 'dublinCore': { + {'citeType': 'book', 'ref': '4', 'parent': None, 'level': 1, 'dublinCore': { 'http://purl.org/dc/terms/title': ['Results', 'Résultats'], 'http://purl.org/dc/terms/creator': ['Isaac Newton']}}, - {'citeType': 'book', 'ref': '5', 'dublinCore': { + {'citeType': 'book', 'ref': '5', 'parent': None, 'level': 1, 'dublinCore': { 'http://purl.org/dc/terms/title': ['Conclusion', 'Conclusion'], 'http://purl.org/dc/terms/creator': ['Marie Curie'] }}] @@ -105,27 +106,27 @@ def test_advanced_cite_data(): xpath = get_xpath_proc(elem=TEI) citeStructure = xpath.evaluate_single("/TEI/teiHeader/refsDecl[1]") parser = CiteStructureParser(citeStructure) - refs = parser.find_refs(root=TEI, structure=parser.units) - refs = [ref.to_dts() for ref in refs] + refs = parser.find_refs(root=TEI, structure=parser.structure) + refs = [ref.json() for ref in refs] assert refs == [ - {'citeType': 'part', 'ref': 'part-1', 'members': [ - {'citeType': 'book', 'ref': 'part-1.1', 'dublinCore': { + {'citeType': 'part', 'ref': 'part-1', 'parent': None, 'level': 1, 'members': [ + 
{'citeType': 'book', 'ref': 'part-1.1', 'parent': 'part-1', 'level': 2, 'dublinCore': { 'http://purl.org/dc/terms/title': ['Introduction', 'Introduction'], 'http://purl.org/dc/terms/creator': ['John Doe']}}, - {'citeType': 'book', 'ref': 'part-1.2', 'dublinCore': { + {'citeType': 'book', 'ref': 'part-1.2', 'parent': 'part-1', 'level': 2, 'dublinCore': { 'http://purl.org/dc/terms/title': ["Background", 'Contexte'] }} ], 'extension': {"http://foo.bar/part": ["1"]}}, - {'citeType': 'part', 'ref': 'part-2', 'members': [ - {'citeType': 'book', 'ref': 'part-2.3', 'dublinCore': { + {'citeType': 'part', 'ref': 'part-2', 'parent': None, 'level': 1, 'members': [ + {'citeType': 'book', 'ref': 'part-2.3', 'parent': 'part-2', 'level': 2, 'dublinCore': { 'http://purl.org/dc/terms/title': ['Methodology', 'Méthodologie'], 'http://purl.org/dc/terms/creator': ['Albert Einstein']}}, - {'citeType': 'book', 'ref': 'part-2.4', 'dublinCore': { + {'citeType': 'book', 'ref': 'part-2.4', 'parent': 'part-2', 'level': 2, 'dublinCore': { 'http://purl.org/dc/terms/title': ['Results', 'Résultats'], 'http://purl.org/dc/terms/creator': ['Isaac Newton']}} ], 'extension': {"http://foo.bar/part": ["2"]}}, - {'citeType': 'part', 'ref': 'part-3', 'members': [ - {'citeType': 'book', 'ref': 'part-3.5', 'dublinCore': { + {'citeType': 'part', 'ref': 'part-3', 'parent': None, 'level': 1, 'members': [ + {'citeType': 'book', 'ref': 'part-3.5', 'parent': 'part-3', 'level': 2, 'dublinCore': { 'http://purl.org/dc/terms/title': ['Conclusion', 'Conclusion'], 'http://purl.org/dc/terms/creator': ['Marie Curie'] }} diff --git a/tests/test_db_create.py b/tests/test_db_create.py new file mode 100644 index 0000000..72984f1 --- /dev/null +++ b/tests/test_db_create.py @@ -0,0 +1,118 @@ +import flask +from dapitains.app.navigation import get_member_by_path, strip_members, generate_paths, get_nav +from dapitains.tei.document import Document +import os + + +local_dir = os.path.join(os.path.dirname(__file__)) + + +def 
test_simple_path(): + """Check that a document can be parsed and that paths are correct""" + doc = Document(f"{local_dir}/tei/multiple_tree.xml") + refs = { + tree: [ref.json() for ref in doc.get_reffs(tree)] + for tree, obj in doc.citeStructure.items() + } + paths = {tree: generate_paths(ref) for tree, ref in refs.items()} + assert paths == { + 'nums': { + 'I': [0], '1': [1], 'A': [2], '4': [3], 'V': [4] + }, + 'alpha': { + 'div-a1': [0], 'div-002': [1], 'div-xyz': [2], 'div-004': [3], 'div-v5': [4] + } + } + # Second part of the test + doc = Document(f"{local_dir}/tei/base_tei.xml") + refs = { + tree: [ref.json() for ref in doc.get_reffs(tree)] + for tree, obj in doc.citeStructure.items() + } + paths = {tree: generate_paths(ref) for tree, ref in refs.items()} + assert paths == { + "default": { + "Luke": [0], + "Luke 1": [0, 0], + "Luke 1:1": [0, 0, 0], + "Luke 1:2": [0, 0, 1], + "Luke 1#1": [0, 0, 2], + "Mark": [1], + "Mark 1": [1, 0], + "Mark 1:1": [1, 0, 0], + "Mark 1:2": [1, 0, 1], + "Mark 1#1": [1, 0, 2], + "Mark 1:3": [1, 0, 3] + } + } + assert strip_members( + get_member_by_path(refs[doc.default_tree], paths[doc.default_tree]["Luke"]) + ) == {'citeType': 'book', 'ref': 'Luke', "level": 1, "parent": None}, "Check that members are stripped" + assert get_member_by_path( + refs[doc.default_tree], paths[doc.default_tree]["Mark 1:3"] + ) == {'citeType': 'verse', 'ref': 'Mark 1:3', "level": 3, "parent": "Mark 1"} + + +def test_navigation(): + doc = Document(f"{local_dir}/tei/base_tei.xml") + refs = { + tree: [ref.json() for ref in obj.find_refs(doc.xml, structure=obj.structure)] + for tree, obj in doc.citeStructure.items() + } + paths = {tree: generate_paths(ref) for tree, ref in refs.items()} + + assert get_nav(refs[doc.default_tree], paths[doc.default_tree], start_or_ref=None, end=None, down=1) == ([ + {'citeType': 'book', 'ref': 'Luke', "level": 1, "parent": None}, + {'citeType': 'book', 'ref': 'Mark', "level": 1, "parent": None} + ], None, None), "Check that
base function works" + + assert get_nav(refs[doc.default_tree], paths[doc.default_tree], start_or_ref="Luke 1:1", end="Luke 1#1", down=0) == ( + [ + {'citeType': 'verse', 'ref': 'Luke 1:1', "level": 3, "parent": "Luke 1"}, + {'citeType': 'verse', 'ref': 'Luke 1:2', "level": 3, "parent": "Luke 1"}, + {'citeType': 'bloup', 'ref': 'Luke 1#1', "level": 3, "parent": "Luke 1"} + ], + {'citeType': 'verse', 'ref': 'Luke 1:1', "level": 3, "parent": "Luke 1"}, + {'citeType': 'bloup', 'ref': 'Luke 1#1', "level": 3, "parent": "Luke 1"} + ), "Check that ?start/end works" + + assert get_nav(refs[doc.default_tree], paths[doc.default_tree], start_or_ref="Luke 1:1", end="Mark 1:2", down=0) == ( + [ + {'citeType': 'verse', 'ref': 'Luke 1:1', "level": 3, "parent": "Luke 1"}, + {'citeType': 'verse', 'ref': 'Luke 1:2', "level": 3, "parent": "Luke 1"}, + {'citeType': 'bloup', 'ref': 'Luke 1#1', "level": 3, "parent": "Luke 1"}, + {'citeType': 'verse', 'ref': 'Mark 1:1', "level": 3, "parent": "Mark 1"}, + {'citeType': 'verse', 'ref': 'Mark 1:2', "level": 3, "parent": "Mark 1"} + ], + {'citeType': 'verse', 'ref': 'Luke 1:1', "level": 3, "parent": "Luke 1"}, + {'citeType': 'verse', 'ref': 'Mark 1:2', "level": 3, "parent": "Mark 1"} + ), "Check that ?start/end works across parents" + + assert get_nav(refs[doc.default_tree], paths[doc.default_tree], start_or_ref="Luke 1", down=1) == ( + [ + {'citeType': 'verse', 'ref': 'Luke 1:1', "level": 3, "parent": "Luke 1"}, + {'citeType': 'verse', 'ref': 'Luke 1:2', "level": 3, "parent": "Luke 1"}, + {'citeType': 'bloup', 'ref': 'Luke 1#1', "level": 3, "parent": "Luke 1"} + ], + {'citeType': 'chapter', 'ref': 'Luke 1', "level": 2, "parent": "Luke"}, + None + ), "Check that ?ref works" + + assert get_nav(refs[doc.default_tree], paths[doc.default_tree], start_or_ref="Luke", down=1) == ( + [ + {'citeType': 'chapter', 'ref': 'Luke 1', "level": 2, "parent": "Luke"}, + ], + {'citeType': 'book', 'ref': 'Luke', "level": 1, "parent": None}, + None + ), "Check 
that ?ref works" + + assert get_nav(refs[doc.default_tree], paths[doc.default_tree], start_or_ref=None, end=None, down=2) == ( + [ + {'citeType': 'book', 'ref': 'Luke', "level": 1, "parent": None}, + {'citeType': 'chapter', 'ref': 'Luke 1', "level": 2, "parent": "Luke"}, + {'citeType': 'book', 'ref': 'Mark', "level": 1, "parent": None}, + {'citeType': 'chapter', 'ref': 'Mark 1', "level": 2, "parent": "Mark"} + ], + None, + None + ), "Check that down=2 works" \ No newline at end of file diff --git a/tests/test_tei.py b/tests/test_tei.py index a9f367e..6107027 100644 --- a/tests/test_tei.py +++ b/tests/test_tei.py @@ -1,7 +1,7 @@ import os.path import pytest -from dapitains.tei.tei import Document +from dapitains.tei.document import Document from lxml.etree import tostring local_dir = os.path.join(os.path.dirname(__file__), "tei")