Skip to content

Commit

Permalink
add util method to generate entities from python dict|list object (#12)
Browse files Browse the repository at this point in the history
* Added is_attribute to EntityPath

* Added sub_schemata to EntitySchema

* Added doc to sub_schemata

* Fixed doc for sub_schemata

* Fixed doc for sub_schemata

* remove pillow dependency

* add build_entities_from_data to utils method

* construct sub entities

* fix mypy issues

* add python-ulid dependency

* support bool, int, float datatypes

* iterate all over the object to generate schema paths

* switch to python sdk

* merge sub_entities of same type

* support list type

* add __str__ method to EntityPath and EntitySchema

* add simple test case for build_entities_from_data

* fix Pep8 E501 in entity.py

* fix mypy issues

* add __str__ and __eq__ for EntitySchema and EntityPath

* compare object instead of comparing string representations

* add test case with array object in first level

* fix mypy issues

* refactor utils module to package

* test case to validate with two level in json object

* Renamed EntitySchema and EntityPath attributes.

* move to python module

* update EntityPath attributes

* add array and empty json tests

* refactor utils module

* fix flake8 issue

* update changelog

---------

Co-authored-by: Robert Isele <[email protected]>
Co-authored-by: Sebastian Tramp <[email protected]>
  • Loading branch information
3 people authored Jan 9, 2024
1 parent f56db7b commit a6e1e9f
Show file tree
Hide file tree
Showing 9 changed files with 693 additions and 118 deletions.
2 changes: 1 addition & 1 deletion .idea/cmem-plugin-base.iml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions .python-version
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
3.11.1
7 changes: 7 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,13 @@ All notable changes to this project will be documented in this file.

The format is based on [Keep a Changelog](http://keepachangelog.com/) and this project adheres to [Semantic Versioning](https://semver.org/)

## [Unreleased]

### Added

- add util method `build_entities_from_data` to generate entities from a Python `dict` or `list` object


## [4.4.0] 2023-11-24

### Added
Expand Down
27 changes: 27 additions & 0 deletions cmem_plugin_base/dataintegration/entity.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,19 @@ def __init__(self, path: str,
self.is_relation = is_relation
self.is_single_value = is_single_value

def __repr__(self):
    """Return a debug representation listing the attributes of this path."""
    fields = dict(
        path=self.path,
        is_relation=self.is_relation,
        is_single_value=self.is_single_value,
    )
    return "EntityPath(" + str(fields) + ")"

def __eq__(self, other):
    """Compare two entity paths attribute by attribute.

    Two paths are equal when ``path``, ``is_relation`` and
    ``is_single_value`` all match.

    For operands that are not an ``EntityPath`` this returns
    ``NotImplemented`` rather than ``False``, so Python can still try the
    reflected comparison on ``other``. ``a == b`` keeps evaluating to
    ``False`` when neither side knows how to compare.
    """
    if not isinstance(other, EntityPath):
        return NotImplemented
    return (
        self.path == other.path
        and self.is_relation == other.is_relation
        and self.is_single_value == other.is_single_value
    )


class EntitySchema:
"""An entity schema.
Expand All @@ -40,6 +53,20 @@ def __init__(self,
self.path_to_root = path_to_root
self.sub_schemata = sub_schemata

def __repr__(self):
    """Return a debug representation of this schema.

    Every attribute that ``__eq__`` compares is listed, including
    ``sub_schemata``, so that schemata which differ only in their nested
    schemata do not print identically.
    """
    obj = {
        "type_uri": self.type_uri,
        "paths": self.paths,
        "path_to_root": self.path_to_root,
        "sub_schemata": self.sub_schemata,
    }
    return f"EntitySchema({obj})"

def __eq__(self, other):
    """Compare two entity schemata attribute by attribute.

    Two schemata are equal when ``type_uri``, ``paths``, ``path_to_root``
    and ``sub_schemata`` all match.

    For operands that are not an ``EntitySchema`` this returns
    ``NotImplemented`` rather than ``False``, so Python can still try the
    reflected comparison on ``other``. ``a == b`` keeps evaluating to
    ``False`` when neither side knows how to compare.
    """
    if not isinstance(other, EntitySchema):
        return NotImplemented
    return (
        self.type_uri == other.type_uri
        and self.paths == other.paths
        and self.path_to_root == other.path_to_root
        and self.sub_schemata == other.sub_schemata
    )


class Entity:
"""An Entity can represent an instance of any given concept.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ def split_task_id(task_id: str) -> tuple:


def write_to_dataset(
dataset_id: str, file_resource=None, context: Optional[UserContext] = None
dataset_id: str, file_resource=None, context: Optional[UserContext] = None
):
"""Write to a dataset.
Expand Down
213 changes: 213 additions & 0 deletions cmem_plugin_base/dataintegration/utils/entity_builder.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,213 @@
"""utils module for building entities from python objects dict|list."""
from typing import Optional, Union

from ulid import ULID

from cmem_plugin_base.dataintegration.entity import (
Entities, Entity, EntityPath
)
from cmem_plugin_base.dataintegration.entity import EntitySchema


def merge_path_values(paths_map1, paths_map2):
    """
    Fold the entries of one path map into another.

    Both arguments map a path to a dictionary. For every path in
    `paths_map2` the dictionary stored in `paths_map1` under the same path
    (or an empty one if there is none, or if it is None) is combined with the
    incoming dictionary. On duplicate inner keys the entry from `paths_map2`
    wins.

    Note that `paths_map1` is updated in place and is also the return value.
    The inner dictionaries are not mutated; each merged entry is a new dict.

    Args:
        paths_map1 (dict): The path map that receives the merged entries.
        paths_map2 (dict): The path map whose entries are merged in.

    Returns:
        dict: `paths_map1`, after merging.
    """
    for path, incoming in paths_map2.items():
        existing = paths_map1.get(path)
        if existing is None:
            existing = {}
        # `|` creates a fresh dict, so the previous inner dict stays untouched.
        paths_map1[path] = existing | incoming
    return paths_map1


def generate_paths_from_data(data, path='root'):
    """
    Generate a dictionary of paths and value types from a nested structure.

    The function walks `data` recursively and builds a dictionary whose keys
    are slash-separated paths (starting at `path`) and whose values map each
    key found at that path to a type name:

    * 'dict' for nested dictionaries,
    * 'list' for lists that contain no dictionary,
    * 'list_dict' for lists that contain at least one dictionary,
    * otherwise the name of the value's type (e.g. 'str', 'int', 'NoneType').

    Nested containers are detected with `isinstance`, the same way the
    top-level `data` and list items are, so `dict` and `list` subclasses are
    traversed instead of being recorded as scalars.

    Args:
        data (dict or list): The nested structure to traverse. Any other
            value yields an empty result.
        path (str, optional): The current path (used for recursion).
            Default is 'root'.

    Returns:
        dict: A dictionary mapping each path to its key-to-type-name map.
    """
    paths_map = {}
    if isinstance(data, list):
        # Every list item contributes to the same path; results are merged.
        for item in data:
            paths_map = merge_path_values(
                paths_map, generate_paths_from_data(item, path=path)
            )
    if isinstance(data, dict):
        key_to_type_map = {}
        for key, value in data.items():
            sub_path = f"{path}/{key}"
            if isinstance(value, dict):
                key_to_type_map[key] = 'dict'
                paths_map = merge_path_values(
                    paths_map,
                    generate_paths_from_data(data=value, path=sub_path),
                )
            elif isinstance(value, list):
                key_to_type_map[key] = 'list'
                for item in value:
                    if isinstance(item, dict):
                        # A single dictionary item is enough to mark the
                        # list as holding nested objects.
                        key_to_type_map[key] = 'list_dict'
                        paths_map = merge_path_values(
                            paths_map,
                            generate_paths_from_data(data=item, path=sub_path),
                        )
            else:
                key_to_type_map[key] = type(value).__name__
        paths_map[path] = key_to_type_map
    return paths_map


def _get_schema(data: Union[dict, list]):
"""Get the schema of an entity."""
if not data:
return None
paths_map = generate_paths_from_data(data=data)
path_to_schema_map = {}
for path, key_to_type_map in paths_map.items():
schema_paths = []
for _key, _type in key_to_type_map.items():
schema_paths.append(
EntityPath(
path=_key,
is_relation=_type in ('dict', 'list_dict'),
is_single_value=_type not in ('list', 'list_dict')
)
)
schema = EntitySchema(
type_uri="",
paths=schema_paths,
)
path_to_schema_map[path] = schema
return path_to_schema_map


def extend_path_list(path_to_entities, sub_path_to_entities):
    """
    Append the entities of one path map to another, path by path.

    Both arguments map a path to a list of entities. For every path in
    `sub_path_to_entities`, its entities are appended to the list stored
    under the same path in `path_to_entities`; a new list is created when
    the path is not present yet.

    Args:
        path_to_entities (dict): The main dictionary of paths to entities.
            It is modified in place.
        sub_path_to_entities (dict): The dictionary of additional paths to
            entities. Its lists are read but not modified.

    Returns:
        None: The result is written into `path_to_entities`.
    """
    for path, additions in sub_path_to_entities.items():
        path_to_entities.setdefault(path, []).extend(additions)


def _get_entity(
    path_from_root,
    path_to_schema_map,
    data,
):
    """Build the entity for one data object, plus all nested entities.

    The schema registered for `path_from_root` decides how each value of
    `data` is converted:

    * a missing or None value becomes `['']`,
    * a plain single value becomes a one-element list with its string form,
    * a plain multi-value becomes the list of string forms of its items,
    * a relation is resolved recursively; the value list then holds the
      URIs of the sub entities that were created.

    Returns a dictionary mapping each path to the entities created for it,
    where the entity for `path_from_root` itself is appended last to its
    list.
    """
    path_to_entities = {}
    entity_uri = f"urn:x-ulid:{ULID()}"
    values = []
    schema = path_to_schema_map[path_from_root]
    for schema_path in schema.paths:
        raw = data.get(schema_path.path)
        if raw is None:
            values.append([''])
            continue
        if not schema_path.is_relation:
            if schema_path.is_single_value:
                values.append([f"{raw}"])
            else:
                values.append([f"{item}" for item in raw])
            continue
        # Relation: treat a single nested object like a one-element list.
        children = [raw] if schema_path.is_single_value else raw
        child_path = f"{path_from_root}/{schema_path.path}"
        child_uris = []
        for child in children:
            child_entities = _get_entity(
                path_from_root=child_path,
                path_to_schema_map=path_to_schema_map,
                data=child,
            )
            # The recursive call appends the child's own entity last.
            child_uris.append(child_entities[child_path][-1].uri)
            extend_path_list(path_to_entities, child_entities)
        values.append(child_uris)
    entity = Entity(uri=entity_uri, values=values)
    path_to_entities.setdefault(path_from_root, []).append(entity)
    return path_to_entities


def _get_entities(
    data: Union[dict, list],
    path_to_schema_map: dict[str, EntitySchema],
) -> dict[str, list[Entity]]:
    """
    Build the entities for the given data, grouped by path.

    A list is treated as a sequence of root objects: one root entity is
    built per item and the per-path results are accumulated. Any other value
    is passed to `_get_entity` as a single root object.
    """
    if not isinstance(data, list):
        return _get_entity(
            path_from_root="root",
            path_to_schema_map=path_to_schema_map,
            data=data,
        )
    collected: dict[str, list[Entity]] = {}
    for item in data:
        extend_path_list(
            collected,
            _get_entity(
                path_from_root="root",
                path_to_schema_map=path_to_schema_map,
                data=item,
            ),
        )
    return collected


def build_entities_from_data(data: Union[dict, list]) -> Optional[Entities]:
    """
    Build an `Entities` object from a python dict or list.

    Returns None when no schema can be derived from `data` (for example for
    empty input). Otherwise the result carries the entities and schema of
    the 'root' path, and one nested `Entities` object for every other path
    that produced entities.
    """
    path_to_schema_map = _get_schema(data)
    if not path_to_schema_map:
        return None
    path_to_entities = _get_entities(
        data=data,
        path_to_schema_map=path_to_schema_map,
    )
    root_entities = iter(path_to_entities.get('root'))  # type: ignore[arg-type]
    root_schema = path_to_schema_map['root']
    sub_entities = []
    for sub_path, sub_path_entities in path_to_entities.items():
        if sub_path == 'root':
            continue
        sub_entities.append(
            Entities(
                entities=iter(sub_path_entities),
                schema=path_to_schema_map[sub_path],
            )
        )
    return Entities(
        entities=root_entities,
        schema=root_schema,
        sub_entities=sub_entities,
    )
Loading

0 comments on commit a6e1e9f

Please sign in to comment.