Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add util method to generate entities from python dict|list object #12

Merged
merged 35 commits into from
Jan 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
3bfae0c
Added is_attribute to EntityPath
robertisele Nov 30, 2023
f86698b
Added sub_schemata to EntitySchema
robertisele Nov 30, 2023
27c7245
Added doc to sub_schemata
robertisele Nov 30, 2023
9e38ace
Merge remote-tracking branch 'origin/feature/addIsAttribute-CMEM-4492…
robertisele Nov 30, 2023
23a0c85
Fixed doc for sub_schemata
robertisele Nov 30, 2023
4f6902b
Fixed doc for sub_schemata
robertisele Nov 30, 2023
90fe95b
remove pillow dependency
msaipraneeth Dec 4, 2023
918c36d
add build_entities_from_data to utils method
msaipraneeth Dec 4, 2023
ad1e105
construct sub entities
msaipraneeth Dec 6, 2023
e83ea63
fix mypy issues
msaipraneeth Dec 6, 2023
6bdf908
add python-ulid dependency
msaipraneeth Dec 8, 2023
12d307e
support bool, int, float datatypes
msaipraneeth Dec 8, 2023
154b38a
iterate all over the object to generate schema paths
msaipraneeth Dec 8, 2023
6a7ebc8
switch to python sdk
seebi Dec 8, 2023
ac82a71
merge sub_entities of same type
msaipraneeth Dec 12, 2023
b0a0a91
support list type
msaipraneeth Dec 12, 2023
96426af
add __str__ method to EntityPath and EntitySchema
msaipraneeth Dec 12, 2023
540dde2
add simple test case for build_entities_from_data
msaipraneeth Dec 12, 2023
a488b3e
fix Pep8 E501 in entity.py
msaipraneeth Dec 12, 2023
da63d3b
fix mypy issues
msaipraneeth Dec 12, 2023
5624ea0
add __str__ and __eq__ for EntitySchema and EntityPath
msaipraneeth Dec 14, 2023
b3b2db7
compare object instead of comparing string representations
msaipraneeth Dec 14, 2023
6c33fbb
add test case with array object in first level
msaipraneeth Dec 14, 2023
8a6c1e9
fix mypy issues
msaipraneeth Dec 14, 2023
e77e2e6
refactor utils module to package
msaipraneeth Dec 14, 2023
9d0ae54
test case to validate with two level in json object
msaipraneeth Dec 14, 2023
46f1359
Renamed EntitySchema and EntityPath attributes.
robertisele Dec 15, 2023
dcbd871
construct sub entities
msaipraneeth Dec 30, 2023
c7b03fa
move to python module
msaipraneeth Dec 30, 2023
d236967
update EntityPath attributes
msaipraneeth Dec 30, 2023
f8ad72d
add array and empty json tests
msaipraneeth Dec 30, 2023
59cdcb3
refactor utils module
msaipraneeth Jan 3, 2024
d71ee0f
fix flake8 issue
msaipraneeth Jan 3, 2024
9bc90c6
Merge branch 'main' into feature/objectToEntities-ECC-5287
msaipraneeth Jan 9, 2024
c51bdde
update changelog
msaipraneeth Jan 9, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .idea/cmem-plugin-base.iml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions .python-version
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
3.11.1
7 changes: 7 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,13 @@ All notable changes to this project will be documented in this file.

The format is based on [Keep a Changelog](http://keepachangelog.com/) and this project adheres to [Semantic Versioning](https://semver.org/)

## [Unreleased]

### Added

- Add a utility method to generate entities from a Python dict or list object


## [4.4.0] 2023-11-24

### Added
Expand Down
27 changes: 27 additions & 0 deletions cmem_plugin_base/dataintegration/entity.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,19 @@ def __init__(self, path: str,
self.is_relation = is_relation
self.is_single_value = is_single_value

def __repr__(self):
obj = {
'path': self.path, 'is_relation': self.is_relation,
'is_single_value': self.is_single_value
}
return f"EntityPath({obj})"

def __eq__(self, other):
return (isinstance(other, EntityPath)
and self.path == other.path
and self.is_relation == other.is_relation
and self.is_single_value == other.is_single_value)


class EntitySchema:
"""An entity schema.
Expand All @@ -40,6 +53,20 @@ def __init__(self,
self.path_to_root = path_to_root
self.sub_schemata = sub_schemata

def __repr__(self):
obj = {
"type_uri": self.type_uri, "paths": self.paths,
"path_to_root": self.path_to_root
}
return f"EntitySchema({obj})"

def __eq__(self, other):
return (isinstance(other, EntitySchema)
and self.type_uri == other.type_uri
and self.paths == other.paths
and self.path_to_root == other.path_to_root
and self.sub_schemata == other.sub_schemata)


class Entity:
"""An Entity can represent an instance of any given concept.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ def split_task_id(task_id: str) -> tuple:


def write_to_dataset(
dataset_id: str, file_resource=None, context: Optional[UserContext] = None
dataset_id: str, file_resource=None, context: Optional[UserContext] = None
):
"""Write to a dataset.

Expand Down
213 changes: 213 additions & 0 deletions cmem_plugin_base/dataintegration/utils/entity_builder.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,213 @@
"""utils module for building entities from python objects dict|list."""
from typing import Optional, Union

from ulid import ULID

from cmem_plugin_base.dataintegration.entity import (
Entities, Entity, EntityPath
)
from cmem_plugin_base.dataintegration.entity import EntitySchema


def merge_path_values(paths_map1, paths_map2):
    """
    Fold the entries of `paths_map2` into `paths_map1`.

    Both arguments map a path to a dictionary of values. For every path in
    `paths_map2` the inner dictionaries are combined with the dict union
    operator, so on a clash of inner keys the entry from `paths_map2` wins.

    Args:
        paths_map1 (dict): The target mapping; it is updated in place.
        paths_map2 (dict): The mapping whose entries are merged into the target.

    Returns:
        dict: `paths_map1` itself, after merging.
    """
    for path, incoming in paths_map2.items():
        existing = paths_map1.get(path)
        if existing is None:
            existing = {}
        # The union builds a new dict, so no inner dict of either argument
        # is mutated.
        paths_map1[path] = existing | incoming
    return paths_map1


def generate_paths_from_data(data, path='root'):
    """
    Generate a dictionary representing paths and data types from a nested JSON
    structure.

    The structure is traversed recursively. Every dict that is reached contributes
    one entry to the result: the key is the slash-separated path of that dict
    (starting at `path`) and the value maps each of the dict's keys to a type name:

    - 'dict' for a nested dict (described further under `<path>/<key>`),
    - 'list_dict' for a list that contains at least one dict (the dict elements
      are described under `<path>/<key>`),
    - 'list' for any other list,
    - the value's type name (e.g. 'str', 'int', 'NoneType') otherwise.

    Nested values are classified with `isinstance`, matching the checks used for
    the top-level `data` and for list elements, so dict and list subclasses
    (for example `collections.OrderedDict`) are treated as dicts and lists too.

    Args:
        data (dict or list): The nested JSON structure to traverse. A list is
            treated as several objects sharing the same `path`.
        path (str, optional): The current path (used for recursion). Default is
            'root'.

    Returns:
        dict: A dictionary representing paths and data types. It is empty when
            `data` is neither a dict nor a list containing dicts.
    """
    paths_map = {}
    if isinstance(data, list):
        for item in data:
            paths_map = merge_path_values(
                paths_map, generate_paths_from_data(item, path=path)
            )
    if isinstance(data, dict):
        key_to_type_map = {}
        for key, value in data.items():
            if isinstance(value, dict):
                key_to_type_map[key] = 'dict'
                paths_map = merge_path_values(
                    paths_map,
                    generate_paths_from_data(data=value, path=f"{path}/{key}"),
                )
            elif isinstance(value, list):
                key_to_type_map[key] = 'list'
                for element in value:
                    if not isinstance(element, dict):
                        continue
                    # One dict element is enough to turn the list into a relation.
                    key_to_type_map[key] = 'list_dict'
                    paths_map = merge_path_values(
                        paths_map,
                        generate_paths_from_data(
                            data=element, path=f"{path}/{key}"
                        ),
                    )
            else:
                key_to_type_map[key] = type(value).__name__
        # Only sub-paths were merged so far, so this never overwrites an entry.
        paths_map[path] = key_to_type_map
    return paths_map


def _get_schema(data: Union[dict, list]):
    """Build one `EntitySchema` per path found in `data`.

    Returns `None` when `data` is empty. Otherwise returns a dictionary that maps
    each path produced by `generate_paths_from_data` to a schema with an empty
    type URI. A key becomes a relation when its type is 'dict' or 'list_dict',
    and is multi-valued when its type is 'list' or 'list_dict'.
    """
    if not data:
        return None
    relation_types = ('dict', 'list_dict')
    multi_value_types = ('list', 'list_dict')
    path_to_schema_map = {}
    for path, key_to_type_map in generate_paths_from_data(data=data).items():
        path_to_schema_map[path] = EntitySchema(
            type_uri="",
            paths=[
                EntityPath(
                    path=name,
                    is_relation=type_name in relation_types,
                    is_single_value=type_name not in multi_value_types,
                )
                for name, type_name in key_to_type_map.items()
            ],
        )
    return path_to_schema_map


def extend_path_list(path_to_entities, sub_path_to_entities):
    """
    Append the entity lists of one path mapping to another.

    For every path in `sub_path_to_entities`, its entities are appended to the
    list stored under the same path in `path_to_entities`. A path that is not
    present yet gets a fresh list, so the lists of `sub_path_to_entities` are
    never shared with the target.

    Args:
        path_to_entities (dict): The main dictionary of paths to entities;
            it is modified in place.
        sub_path_to_entities (dict): The dictionary of additional paths to
            entities.

    Returns:
        None: The result is only visible through `path_to_entities`.
    """
    for path, additions in sub_path_to_entities.items():
        path_to_entities.setdefault(path, []).extend(additions)


def _get_entity(
    path_from_root,
    path_to_schema_map,
    data,
):
    """Build the entity for `data` together with all of its nested sub-entities.

    The entity gets a newly generated `urn:x-ulid:` URI and one value list per
    path of the schema registered under `path_from_root`:

    - `['']` when the key is missing from `data` or its value is `None`,
    - the stringified value(s) for non-relation paths,
    - the URIs of the sub-entities for relation paths. Each sub-entity is built
      recursively under `<path_from_root>/<key>`.

    Returns a dictionary mapping every path touched to the list of entities
    created for it; the entity for `data` itself is stored under
    `path_from_root`.
    """
    path_to_entities = {}
    # Generated before descending, so the parent ULID is created before the
    # ULIDs of its sub-entities.
    entity_uri = f"urn:x-ulid:{ULID()}"
    values = []
    schema = path_to_schema_map[path_from_root]
    for schema_path in schema.paths:
        raw = data.get(schema_path.path)
        if raw is None:
            values.append([''])
            continue
        if not schema_path.is_relation:
            if schema_path.is_single_value:
                values.append([f"{raw}"])
            else:
                values.append([f"{item}" for item in raw])
            continue
        # Relation: a single nested object is handled like a one-element list.
        children = [raw] if schema_path.is_single_value else raw
        child_uris = []
        for child in children:
            sub_entity_path = f"{path_from_root}/{schema_path.path}"
            sub_path_to_entities = _get_entity(
                path_from_root=sub_entity_path,
                path_to_schema_map=path_to_schema_map,
                data=child,
            )
            # The entity created for `child` is the last one under its own path.
            child_uris.append(sub_path_to_entities[sub_entity_path][-1].uri)
            extend_path_list(path_to_entities, sub_path_to_entities)
        values.append(child_uris)
    path_to_entities.setdefault(path_from_root, []).append(
        Entity(uri=entity_uri, values=values)
    )
    return path_to_entities


def _get_entities(
    data: Union[dict, list],
    path_to_schema_map: dict[str, EntitySchema],
) -> dict[str, list[Entity]]:
    """
    Build the entities for `data`, grouped by schema path.

    A list is treated as a sequence of root objects whose entities are collected
    into one mapping; anything else is treated as a single root object.
    """
    if not isinstance(data, list):
        return _get_entity(
            path_from_root="root",
            path_to_schema_map=path_to_schema_map,
            data=data,
        )
    path_to_entities: dict[str, list[Entity]] = {}
    for item in data:
        extend_path_list(
            path_to_entities,
            _get_entity(
                path_from_root="root",
                path_to_schema_map=path_to_schema_map,
                data=item,
            ),
        )
    return path_to_entities


def build_entities_from_data(data: Union[dict, list]) -> Optional[Entities]:
    """
    Convert a python dict or list into an `Entities` object.

    Returns `None` when no schema can be derived from `data` (for example when
    it is empty). Otherwise the entities and schema of the 'root' path form the
    result, and every other path contributes one nested `Entities` object to
    `sub_entities`.
    """
    schemata = _get_schema(data)
    if not schemata:
        return None
    entities_by_path = _get_entities(
        data=data,
        path_to_schema_map=schemata,
    )
    root_entities = iter(entities_by_path.get('root'))  # type: ignore[arg-type]
    root_schema = schemata['root']
    sub_entities = []
    for path, path_entities in entities_by_path.items():
        if path == 'root':
            continue
        sub_entities.append(
            Entities(entities=iter(path_entities), schema=schemata[path])
        )
    return Entities(
        entities=root_entities,
        schema=root_schema,
        sub_entities=sub_entities,
    )
Loading