Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

added functionality to run and use siegfried #12

Merged
merged 12 commits into from
Oct 17, 2023
16 changes: 13 additions & 3 deletions acacore/database/base.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,23 @@
from datetime import datetime
from os import PathLike
from pathlib import Path
from sqlite3 import Connection, OperationalError
from sqlite3 import Connection
from sqlite3 import Cursor as SQLiteCursor
from typing import Any, Generator, Generic, Optional, Type, TypeVar, Union, overload
from sqlite3 import OperationalError
from typing import Any
from typing import Generator
from typing import Generic
from typing import Optional
from typing import Type
from typing import TypeVar
from typing import Union
from typing import overload

from pydantic.main import BaseModel

from .column import Column, SelectColumn, model_to_columns
from .column import Column
from .column import SelectColumn
from .column import model_to_columns

T = TypeVar("T")
R = TypeVar("R")
Expand Down
7 changes: 6 additions & 1 deletion acacore/database/column.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
from datetime import datetime
from pathlib import Path
from typing import Callable, Generic, Optional, Type, TypeVar, Union
from typing import Callable
from typing import Generic
from typing import Optional
from typing import Type
from typing import TypeVar
from typing import Union
from uuid import UUID

from pydantic import BaseModel
Expand Down
41 changes: 20 additions & 21 deletions acacore/database/files_db.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,16 +15,16 @@

class FileDB(FileDBBase):
def __init__(
self,
database: Union[str, bytes, PathLike[str], PathLike[bytes]],
*,
timeout: float = 5.0,
detect_types: int = 0,
isolation_level: Optional[str] = "DEFERRED",
check_same_thread: bool = True,
factory: Optional[Type[Connection]] = Connection,
cached_statements: int = 100,
uri: bool = False,
self,
database: Union[str, bytes, PathLike[str], PathLike[bytes]],
*,
timeout: float = 5.0,
detect_types: int = 0,
isolation_level: Optional[str] = "DEFERRED",
check_same_thread: bool = True,
factory: Optional[Type[Connection]] = Connection,
cached_statements: int = 100,
uri: bool = False,
) -> None:
"""
A class that handles the SQLite database used by Aarhus City Archives to process data archives.
Expand All @@ -45,8 +45,7 @@ def __init__(
to avoid parsing overhead.
uri: If set to True, database is interpreted as a URI with a file path and an optional query string.
"""
from acacore.models.file import ConvertedFile
from acacore.models.file import File
from acacore.models.file import ConvertedFile, File
from acacore.models.history import HistoryEntry
from acacore.models.identification import SignatureCount
from acacore.models.metadata import Metadata
Expand Down Expand Up @@ -135,20 +134,20 @@ def is_empty(self) -> bool:
return not self.files.select(limit=1).fetchone()

def add_history(
self,
uuid: UUID,
operation: str,
data: Optional[Union[dict, list, str, int, float, bool, datetime]],
reason: Optional[str] = None,
*,
time: Optional[datetime] = None,
self,
uuid: UUID,
operation: str,
data: Any, # noqa: ANN401
reason: Optional[str] = None,
*,
time: Optional[datetime] = None,
):
self.history.insert(
self.history.model(
uuid=uuid,
operation=operation,
data=data,
reason=reason,
time=time or datetime.utcnow(),
)
time=time or datetime.now(), # noqa: DTZ005
),
)
5 changes: 3 additions & 2 deletions acacore/models/file.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,14 @@
from pathlib import Path
from typing import Optional

from pydantic import UUID4, Field
from pydantic import Field
from pydantic import UUID4

from acacore.utils.io import size_fmt

from .base import ACABase
from .identification import Identification


# -----------------------------------------------------------------------------
# Model
# -----------------------------------------------------------------------------
Expand Down
5 changes: 3 additions & 2 deletions acacore/models/file_data.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
from pathlib import Path
from typing import Any, ClassVar, Optional
from typing import Any
from typing import ClassVar
from typing import Optional

from pydantic import model_validator

from acacore.database.files_db import FileDB

from .base import ACABase
from .file import ArchiveFile

Expand Down
3 changes: 2 additions & 1 deletion acacore/models/identification.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from typing import Any, Optional
from typing import Any
from typing import Optional

from pydantic import model_validator

Expand Down
2 changes: 2 additions & 0 deletions acacore/reference_files/__init__.py
Magniler marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
"""Collection of functions that fetch the JSON reference files (formats to re-identify and custom signatures)."""
from . import ref_files
46 changes: 46 additions & 0 deletions acacore/reference_files/ref_files.py
Magniler marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
import json
from functools import lru_cache
from http.client import HTTPResponse
from urllib import request


@lru_cache
def to_re_identify() -> dict[str, str]:
    """Fetch the JSON file listing the formats that we wish to re-identify.

    The file is kept updated in the reference-files repository. A successful
    result is cached, so multiple calls in the same run are not an issue.

    Returns:
        The decoded content of ``to_reidentify.json``.

    Raises:
        ConnectionError: If the response status is not 200, or if the decoded
            document is ``null``.
    """
    url = "https://raw.githubusercontent.com/aarhusstadsarkiv/reference-files/main/to_reidentify.json"

    response: HTTPResponse
    # The context manager closes the connection even when the status check,
    # the read, or the JSON decoding raises.
    with request.urlopen(url) as response:
        status = response.getcode()
        if status != 200:
            raise ConnectionError(f"Unexpected HTTP status {status} for {url}")
        re_identify_map: dict[str, str] = json.loads(response.read())

    if re_identify_map is None:
        raise ConnectionError(f"Empty JSON document received from {url}")

    return re_identify_map


@lru_cache
def costum_sigs() -> list[dict]:
    """Fetch the JSON file with our own custom format signatures as a list.

    The file is kept updated in the reference-files repository. A successful
    result is cached, so multiple calls in the same run are not an issue.

    Returns:
        The decoded content of ``custom_signatures.json``.

    Raises:
        ConnectionError: If the response status is not 200, or if the decoded
            document is ``null``.
    """
    url = "https://raw.githubusercontent.com/aarhusstadsarkiv/reference-files/main/custom_signatures.json"

    response: HTTPResponse
    # The context manager closes the connection even when the status check,
    # the read, or the JSON decoding raises.
    with request.urlopen(url) as response:
        status = response.getcode()
        if status != 200:
            raise ConnectionError(f"Unexpected HTTP status {status} for {url}")
        custom_signatures: list[dict] = json.loads(response.read())

    if custom_signatures is None:
        raise ConnectionError(f"Empty JSON document received from {url}")

    return custom_signatures
2 changes: 2 additions & 0 deletions acacore/siegfried_utils/__init__.py
Magniler marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
"""Place for functions and classes that act as an entrance to siegfried"""
from . import identify
Loading
Loading