glossary-sync.py

import logging
import os
import pathlib
from typing import Dict, Iterable, List, Optional, Set, Tuple, TypeVar, Union

import click
import ruamel.yaml.util
from ruamel.yaml import YAML

from datahub.ingestion.graph.client import (
    DatahubClientConfig,
    DataHubGraph,
    get_default_graph,
)
from datahub.ingestion.source.metadata.business_glossary import (
    BusinessGlossaryConfig,
    BusinessGlossaryFileSource,
    DefaultConfig,
    GlossaryNodeConfig,
    GlossaryNodeInterface,
    GlossaryTermConfig,
    KnowledgeCard,
    Owners,
    materialize_all_node_urns,
    populate_path_vs_id,
)
from datahub.metadata.schema_classes import (
    AspectBag,
    DomainsClass,
    GlossaryNodeInfoClass,
    GlossaryRelatedTermsClass,
    GlossaryTermInfoClass,
    InstitutionalMemoryClass,
    OwnershipClass,
)
from datahub.utilities.urns.urn import guess_entity_type

logger = logging.getLogger(__name__)

ParentUrn = str
TEST_MODE = os.getenv("TEST_MODE", "false").lower() == "true"

_DELETED_NODE_TOMBSTONE = dict()

_GlossaryElement = TypeVar("_GlossaryElement", GlossaryNodeConfig, GlossaryTermConfig)


def get_graph() -> DataHubGraph:
    if TEST_MODE:
        return get_default_graph()
    else:
        DATAHUB_SERVER = os.environ["DATAHUB_GMS_HOST"]
        DATAHUB_TOKEN: Optional[str] = os.getenv("DATAHUB_GMS_TOKEN")

        return DataHubGraph(
            DatahubClientConfig(server=DATAHUB_SERVER, token=DATAHUB_TOKEN)
        )


def _get_id_from_urn(urn: str) -> str:
    return urn.split(":")[-1]


def _datahub_ownership_to_owners(
    ownership: Optional[OwnershipClass],
) -> Optional[Owners]:
    if ownership is None:
        return None

    owner_urns = [owner.owner for owner in ownership.owners]

    return Owners(
        users=[
            _get_id_from_urn(urn)
            for urn in owner_urns
            if guess_entity_type(urn) == "corpuser"
        ]
        or None,
        groups=[
            _get_id_from_urn(urn)
            for urn in owner_urns
            if guess_entity_type(urn) == "corpGroup"
        ]
        or None,
    )


def _datahub_domain_to_str(domain: Optional[DomainsClass]) -> Optional[str]:
    if domain is None or not domain.domains:
        return None

    if len(domain.domains) > 1:
        logger.warning(f"Found multiple domains for {domain} - using first one")
        return None

    # TODO: Need to look up the domain name from the urn.
    return domain.domains[0]


def _datahub_institutional_memory_to_knowledge_links(
    institutionalMemory: Optional[InstitutionalMemoryClass],
) -> Optional[List[KnowledgeCard]]:
    if institutionalMemory is None:
        return None

    return [
        KnowledgeCard(
            url=element.url,
            label=element.description,
        )
        for element in institutionalMemory.elements
    ]


def _glossary_node_from_datahub(
    urn: str, aspects: AspectBag
) -> Tuple[GlossaryNodeConfig, Optional[ParentUrn]]:
    info_aspect: GlossaryNodeInfoClass = aspects["glossaryNodeInfo"]

    owners = aspects.get("ownership")
    institutionalMemory = aspects.get("institutionalMemory")

    node = GlossaryNodeConfig(
        id=urn,
        name=info_aspect.name or _get_id_from_urn(urn),
        description=info_aspect.definition,
        owners=_datahub_ownership_to_owners(owners),
        knowledge_links=_datahub_institutional_memory_to_knowledge_links(
            institutionalMemory
        ),
        # These are populated later.
        terms=[],
        nodes=[],
    )
    node._urn = urn
    parent_urn = info_aspect.parentNode

    return node, parent_urn


def _glossary_term_from_datahub(
    urn: str, aspects: AspectBag
) -> Tuple[GlossaryTermConfig, Optional[ParentUrn]]:
    info_aspect: GlossaryTermInfoClass = aspects["glossaryTermInfo"]

    related_terms: GlossaryRelatedTermsClass = aspects.get(
        "glossaryRelatedTerms", GlossaryRelatedTermsClass()
    )
    owners = aspects.get("ownership")
    domain = aspects.get("domains")
    institutionalMemory = aspects.get("institutionalMemory")

    term = GlossaryTermConfig(
        id=urn,
        name=info_aspect.name or _get_id_from_urn(urn),
        description=info_aspect.definition,
        term_source=info_aspect.termSource,
        source_ref=info_aspect.sourceRef,
        source_url=info_aspect.sourceUrl,
        owners=_datahub_ownership_to_owners(owners),
        custom_properties=info_aspect.customProperties or None,
        knowledge_links=_datahub_institutional_memory_to_knowledge_links(
            institutionalMemory
        ),
        domain=_datahub_domain_to_str(domain),
        # Where possible, these are converted into biz glossary paths later.
        inherits=related_terms.isRelatedTerms,
        contains=related_terms.hasRelatedTerms,
        values=related_terms.values,
        related_terms=related_terms.relatedTerms,
    )

    term._urn = urn
    parent_urn = info_aspect.parentNode

    return term, parent_urn


def fetch_datahub_glossary():
    graph = get_graph()

    # Get all the urns in the glossary.
    urns = list(graph.get_urns_by_filter(entity_types=["glossaryTerm", "glossaryNode"]))

    # Hydrate them into entities.
    entities = {urn: graph.get_entity_semityped(urn) for urn in urns}

    # Map these into pydantic models defined in the biz glossary source.
    # 1. Map each AspectBag -> pydantic model.
    # 2. Construct the hierarchy of pydantic models using the lookup table.

    # Parse glossary nodes.
    raw_nodes = {
        urn: _glossary_node_from_datahub(urn, entities[urn])
        for urn in urns
        if guess_entity_type(urn) == "glossaryNode"
    }

    # Construct the hierarchy of nodes.
    top_level_nodes: List[GlossaryNodeConfig] = []
    for (node, parent_urn) in raw_nodes.values():
        if parent_urn is None:
            top_level_nodes.append(node)
        else:
            parent_node, _ = raw_nodes[parent_urn]
            parent_node.nodes = parent_node.nodes or []
            parent_node.nodes.append(node)

    # Parse glossary terms.
    raw_terms = {
        urn: _glossary_term_from_datahub(urn, entities[urn])
        for urn in urns
        if guess_entity_type(urn) == "glossaryTerm"
    }

    # Construct the hierarchy of terms.
    top_level_terms: List[GlossaryTermConfig] = []
    for (term, parent_urn) in raw_terms.values():
        if parent_urn is None:
            top_level_terms.append(term)
        else:
            parent_node, _ = raw_nodes[parent_urn]
            parent_node.terms = parent_node.terms or []
            parent_node.terms.append(term)

    return top_level_nodes, top_level_terms


def prune_latest_glossary(
    latest_glossary: BusinessGlossaryConfig, existing_glossary: BusinessGlossaryConfig
) -> None:
    # TODO: Update this logic to allow for pruning of nodes within the hierarchy as well.

    allowed_node_urns = set(node._urn for node in (existing_glossary.nodes or []))
    allowed_term_urns = set(term._urn for term in (existing_glossary.terms or []))

    latest_glossary.nodes = [
        node for node in (latest_glossary.nodes or []) if node._urn in allowed_node_urns
    ]
    latest_glossary.terms = [
        term for term in (latest_glossary.terms or []) if term._urn in allowed_term_urns
    ]


def replace_urn_refs_with_paths(
    glossary: BusinessGlossaryConfig, path_to_id_map: Dict[str, str]
) -> None:
    urn_to_path_map = {urn: path for (path, urn) in path_to_id_map.items()}

    def _simplify_urn_list(urns: Optional[List[str]]) -> Optional[List[str]]:
        if urns is None:
            return None

        return [urn_to_path_map.get(urn, urn) for urn in urns]

    def _process_child_terms(parent_node: GlossaryNodeInterface) -> None:
        for term in parent_node.terms or []:
            term.inherits = _simplify_urn_list(term.inherits)
            term.contains = _simplify_urn_list(term.contains)
            term.values = _simplify_urn_list(term.values)
            term.related_terms = _simplify_urn_list(term.related_terms)

        for node in parent_node.nodes or []:
            _process_child_terms(node)

    _process_child_terms(glossary)


def _align_glossary_elements(
    latest: List[_GlossaryElement], existing: List[_GlossaryElement]
) -> Iterable[Tuple[Optional[_GlossaryElement], Optional[_GlossaryElement]]]:
    latest_by_id = {element._urn: element for element in latest}
    latest_ids = set(latest_by_id.keys())

    # Emit all existing elements first, matching where with latest where possible.
    for existing_element in existing:
        if existing_element._urn in latest_ids:
            latest_ids.remove(existing_element._urn)
            yield latest_by_id[existing_element._urn], existing_element
        else:
            yield None, existing_element

    # Emit all new elements.
    for latest_id in latest_ids:
        yield latest_by_id[latest_id], None


def glossary_to_dict_minimize_diffs(
    latest_glossary: BusinessGlossaryConfig, existing_glossary: BusinessGlossaryConfig
) -> dict:
    def _simple_elem_to_dict(
        latest_elem: Union[BusinessGlossaryConfig, _GlossaryElement],
        existing_elem: Union[None, BusinessGlossaryConfig, _GlossaryElement],
        defaults: DefaultConfig,
        exclude: Optional[Set[str]] = None,
    ) -> dict:
        if isinstance(latest_elem, BusinessGlossaryConfig):
            return latest_elem.dict(
                exclude=exclude, exclude_defaults=True, exclude_none=True
            )
        assert not isinstance(existing_elem, BusinessGlossaryConfig)

        # Exclude fields that are default values here AND are not set in the existing glossary.
        #
        # In other words, a field will be included if:
        # 1. is set in the existing glossary or
        # 2. the value in the latest glossary is not a default value (and this isn't the top-level config)
        exclude = (exclude or set()).copy()

        # Exclude any fields that match the defaults.
        if defaults.owners is not None and defaults.owners == latest_elem.owners:
            exclude.add("owners")
        if isinstance(latest_elem, GlossaryTermConfig):
            if (
                defaults.source is not None
                and defaults.source == latest_elem.source_ref
            ):
                exclude.add("source_ref")
            if defaults.url == latest_elem.source_url:
                exclude.add("source_url")
            if defaults.source_type == latest_elem.term_source:
                exclude.add("term_source")

        if existing_elem is not None:
            # SUBTLE: We can drop the ID here because we know that existing_elem._urn
            # matches latest_elem._urn. If existing_elem is not None, then
            # we know the id field is redundant if it is not set in the existing glossary.
            exclude.add("id")

            # Make sure to include any fields that are explicitly set in the existing glossary.
            existing_set_keys = existing_elem.__fields_set__
            exclude -= existing_set_keys

        fields = latest_elem.dict(
            exclude=exclude,
            exclude_defaults=True,
            exclude_none=True,
        )

        return fields

    def _to_dict(
        latest_node: GlossaryNodeInterface,
        existing_node: GlossaryNodeInterface,
        defaults: DefaultConfig,
    ) -> dict:
        # Process terms.
        terms = []
        for latest_term_elem, existing_term_elem in _align_glossary_elements(
            latest_node.terms or [], existing_node.terms or []
        ):
            if latest_term_elem is None:
                terms.append(_DELETED_NODE_TOMBSTONE)
            else:
                terms.append(
                    _simple_elem_to_dict(latest_term_elem, existing_term_elem, defaults)
                )

        # Process nodes.
        nodes = []
        for latest_node_elem, existing_node_elem in _align_glossary_elements(
            latest_node.nodes or [], existing_node.nodes or []
        ):
            if latest_node_elem is None:
                nodes.append(_DELETED_NODE_TOMBSTONE)
            elif existing_node_elem is None:
                nodes.append(
                    _to_dict(
                        latest_node_elem,
                        existing_node=GlossaryNodeConfig(
                            id=latest_node_elem._urn,
                            name=latest_node_elem.name,
                            description="",
                        ),
                        defaults=defaults,
                    )
                )
            else:
                nodes.append(
                    _to_dict(
                        latest_node_elem,
                        existing_node_elem,
                        # Update defaults with the current node owner, if any.
                        # This way, the "owners" field cascades down to all child nodes.
                        defaults=defaults.copy(
                            update=dict(owners=existing_node_elem.owners)
                            if existing_node_elem.owners
                            else dict()
                        ),
                    )
                )

        # Retain other fields as-is.
        other_fields = _simple_elem_to_dict(
            latest_node, existing_node, defaults, exclude={"terms", "nodes"}
        )

        return {
            **other_fields,
            **({"terms": terms} if terms else {}),
            **({"nodes": nodes} if nodes else {}),
        }

    return _to_dict(latest_glossary, existing_glossary, defaults=existing_glossary)


def update_yml_to_match(
    infile: pathlib.Path,
    outfile: pathlib.Path,
    target: dict,
) -> None:
    yaml = YAML()
    yaml.preserve_quotes = True  # type: ignore[assignment]

    doc = yaml.load(infile)

    def _update_doc(doc, target, update_full):
        if isinstance(target, dict):
            if not isinstance(doc, dict):
                update_full(target)
            else:
                for key, value in target.items():
                    if key not in doc:
                        doc[key] = value
                    else:

                        def _update_value(v):
                            doc[key] = v

                        _update_doc(doc[key], value, _update_value)

        elif isinstance(target, list):
            if not isinstance(doc, list):
                update_full(doc, target)
            else:
                # We assume that the two lists are perfectly aligned, which the exception of deletions.
                elems_to_delete = []
                for i, value in enumerate(target):
                    if i >= len(doc):
                        doc.append(value)
                    elif doc[i] == _DELETED_NODE_TOMBSTONE:
                        elems_to_delete.append(i)
                    else:

                        def _update_value(v):
                            doc[i] = v

                        _update_doc(doc[i], value, _update_value)

                for i in reversed(elems_to_delete):
                    del doc[i]
        else:
            update_full(target)

    _update_doc(doc, target, None)

    # Guess existing indentation in the file so that we can preserve it.
    _, ind, bsi = ruamel.yaml.util.load_yaml_guess_indent(infile.read_text())
    yaml.width = 2**20  # type: ignore[assignment]
    yaml.indent(mapping=bsi, sequence=ind, offset=bsi)

    yaml.dump(doc, outfile)


def _update_glossary_file(
    file: str, enable_auto_id: bool, output: Optional[str], prune: bool = True
) -> None:
    if not output:
        output = file

    # Read the existing biz glossary file.
    existing_glossary = BusinessGlossaryFileSource.load_glossary_config(file)
    materialize_all_node_urns(existing_glossary, enable_auto_id=enable_auto_id)

    # Fetch the latest glossary from DataHub.
    top_level_nodes, top_level_terms = fetch_datahub_glossary()
    latest_glossary = existing_glossary.copy(
        update=dict(
            nodes=top_level_nodes,
            terms=top_level_terms,
        ),
        deep=True,
    )

    # Prune the latest glossary to only include file-managed nodes/terms
    if prune:
        prune_latest_glossary(
            latest_glossary=latest_glossary,
            existing_glossary=existing_glossary,
        )

    # Recursively simplify urn references where possible.
    path_to_id_map = populate_path_vs_id(latest_glossary)
    replace_urn_refs_with_paths(latest_glossary, path_to_id_map=path_to_id_map)

    # Minimize diffs between the two glossaries using the field defaults and default config.
    obj = glossary_to_dict_minimize_diffs(
        latest_glossary=latest_glossary,
        existing_glossary=existing_glossary,
    )

    # Serialize back into yaml.
    update_yml_to_match(pathlib.Path(file), pathlib.Path(output), obj)


@click.group()
def cli():
    pass


@cli.command()
@click.option("--enable-auto-id", type=bool, required=True)
@click.option("--file", type=click.Path(exists=True, dir_okay=False), required=True)
@click.option("--prune", type=bool, default=False, required=False)
@click.option("--output", type=click.Path())
def update_glossary_file(
    file: str, enable_auto_id: bool, prune: bool, output: Optional[str]
) -> None:
    if not output:
        output = file

    _update_glossary_file(
        file, enable_auto_id=enable_auto_id, output=output, prune=prune
    )


@cli.command()
@click.option("--output", type=click.Path(), required=True)
@click.option("--default-source-ref")
@click.option("--default-source-url")
@click.option("--default-source-type")
def bootstrap_glossary_yml(
    output: str,
    default_source_ref: Optional[str],
    default_source_url: Optional[str],
    default_source_type: Optional[str],
) -> None:
    default_yml = """
# This file was auto-generated by the biz glossary CLI.
version: 1
owners: {}
"""
    if default_source_ref:
        default_yml += f"source: {default_source_ref}\n"
    if default_source_url:
        default_yml += f"url: {default_source_url}\n"
    if default_source_type:
        default_yml += f"source_type: {default_source_type}\n"

    pathlib.Path(output).write_text(default_yml)

    _update_glossary_file(output, enable_auto_id=True, output=output, prune=False)


if __name__ == "__main__":
    cli()