def find_jinja_blocks(self) -> Dict[str, JinjaBlock]:
    """Locate the Jinja block behind every doc and macro owned by this project.

    Walks ``self.manifest.docs`` and then ``self.manifest.macros``, ignores
    entries whose ``package_name`` differs from ``self.name``, and loads the
    matching block from the entry's ``original_file_path`` (resolved against
    ``self.path``) via ``JinjaBlock.from_file``.

    Returns:
        A mapping of manifest ``unique_id`` to the ``JinjaBlock`` read from disk.
    """
    # Pair each manifest collection with the Jinja tag keyword that opens its
    # blocks. Docs are visited before macros, so a macro entry overwrites a doc
    # entry if a unique_id were ever shared between the two collections.
    sources = (
        (self.manifest.docs, "docs"),
        (self.manifest.macros, "macro"),
    )

    found = {}
    for collection, block_type in sources:
        for unique_id, entry in collection.items():
            if entry.package_name == self.name:
                found[unique_id] = JinjaBlock.from_file(
                    path=self.path / entry.original_file_path,
                    block_type=block_type,
                    name=entry.name,
                )

    return found
@property
def referenced_docs(self) -> Set[str]:
    """Return the unique_ids of every doc block referenced by this SubProject.

    Each selected resource (tests are skipped) is scanned for ``doc(...)``
    references in two places: the node's ``raw_code`` and the YAML patch file
    resolved through the parent project. The referenced names are then mapped
    to the ``unique_id`` of every manifest doc with a matching ``name``.

    The result is computed once and cached on ``self._referenced_docs``.

    Raises:
        Exception: If a selected resource cannot be found in the manifest.
    """
    # Compare against None rather than truthiness: an empty set is a valid,
    # already-computed answer and must not trigger a full rescan on every access.
    if self._referenced_docs is not None:
        return self._referenced_docs

    doc_names: Set[str] = set()
    # Many nodes are usually patched by the same YAML file; read each file once.
    scanned_paths = set()

    for unique_id in self.resources:
        if unique_id.startswith("test."):
            continue

        node = self.get_manifest_node(unique_id)
        if node is None:
            # Report the id that was looked up; `node` itself is always None here.
            raise Exception(f"Unable to find referenced node {unique_id}")

        raw_code = getattr(node, "raw_code", None)
        if raw_code:
            doc_names.update(find_doc_reference(raw_code))

        # Nodes without a YAML patch carry an empty patch_path; there is nothing
        # to resolve for them (same guard as used when copying resource YAML).
        if getattr(node, "patch_path", None):
            path = self.parent_project.resolve_patch_path(node)
            if path not in scanned_paths:
                scanned_paths.add(path)
                if path.exists():
                    doc_names.update(find_doc_reference(path.read_text()))

    # Use the search name for the doc to resolve a unique_id for the doc resource.
    self._referenced_docs = {
        unique_id for unique_id, doc in self.manifest.docs.items() if doc.name in doc_names
    }

    return self._referenced_docs
raise FatalMeshifyException(f"Error creating subproject {subproject.name}") diff --git a/dbt_meshify/storage/dbt_project_editors.py b/dbt_meshify/storage/dbt_project_editors.py index ecea8ba..2b45c91 100644 --- a/dbt_meshify/storage/dbt_project_editors.py +++ b/dbt_meshify/storage/dbt_project_editors.py @@ -22,6 +22,7 @@ from dbt_meshify.dbt_projects import DbtSubProject from dbt_meshify.storage.file_content_editors import NamedList, filter_empty_dict_items from dbt_meshify.storage.file_manager import YAMLFileManager, yaml +from dbt_meshify.storage.jinja_blocks import JinjaBlock from dbt_meshify.utilities.contractor import Contractor from dbt_meshify.utilities.dependencies import DependenciesUpdater from dbt_meshify.utilities.grouper import ResourceGrouper @@ -141,7 +142,12 @@ def initialize(self) -> ChangeSet: f"Identifying operations required to split {subproject.name} from {subproject.parent_project.name}." ) - for unique_id in subproject.resources | subproject.custom_macros | subproject.groups: + for unique_id in ( + subproject.resources + | subproject.custom_macros + | subproject.groups + | subproject.referenced_docs + ): resource = subproject.get_manifest_node(unique_id) if not resource: raise KeyError(f"Resource {unique_id} not found in manifest") @@ -182,10 +188,20 @@ def initialize(self) -> ChangeSet: ): change_set.extend(reference_updater.update_parent_refs(resource)) - elif resource.resource_type in ["macro", "group"]: + elif resource.resource_type in ["macro", "group", "doc"]: if hasattr(resource, "patch_path") and resource.patch_path: change_set.add(self.copy_resource_yml(resource)) - change_set.add(self.copy_resource(resource)) + + if resource.unique_id in self.subproject.parent_project.jinja_blocks: + change_set.add( + self.copy_jinja_block( + resource, + self.subproject.parent_project.jinja_blocks[resource.unique_id], + ) + ) + + else: + change_set.add(self.copy_resource(resource)) else: logger.debug( @@ -237,6 +253,17 @@ def move_resource(self, 
def copy_jinja_block(self, resource: Resource, jinja_block: JinjaBlock) -> FileChange:
    """Build the change that copies a single Jinja block into the subproject.

    The block's text is appended to the file that
    ``self.subproject.resolve_file_path(resource)`` resolves to. The returned
    change carries no source path, so it only adds content at the destination.

    Args:
        resource: The manifest resource (doc or macro) that owns the block.
        jinja_block: The block whose ``content`` should be appended.

    Returns:
        A ``FileChange`` with ``Operation.Append`` holding the block's content.
    """
    # The block content stops at the closing tag. Terminate it with a newline so
    # that several blocks appended to the same destination file do not run
    # together on one line, and so the file ends with a newline.
    content = jinja_block.content
    if not content.endswith("\n"):
        content += "\n"

    return FileChange(
        operation=Operation.Append,
        entity_type=EntityType.Code,
        identifier=resource.name,
        path=self.subproject.resolve_file_path(resource),
        data=content,
    )
import re
from dataclasses import dataclass
from pathlib import Path
from typing import Set, Tuple

# Matches `{{ doc('name') }}` / `{{ doc("name") }}`. Whitespace inside the
# expression is optional and Jinja whitespace control (`{{-` / `-}}`) is allowed.
_DOC_REFERENCE = re.compile(
    r"\{\{-?\s*doc\(\s*\'?\"?([a-zA-Z0-9_\-\.]+)\'?\"?\s*\)\s*-?\}\}"
)


@dataclass
class JinjaBlock:
    """
    A data structure for tracking Jinja blocks of text. Includes the start and end
    character positions, and the content of the block.
    """

    path: Path
    block_type: str
    name: str
    start: int  # character offset of the first character of the opening tag
    end: int  # character offset just past the closing tag
    content: str

    @staticmethod
    def find_block_range(file_content: str, block_type: str, name: str) -> Tuple[int, int]:
        """Find the character range spanned by a named Jinja block.

        Args:
            file_content: Full text of the file to search.
            block_type: The tag keyword that opens the block (e.g. ``docs``, ``macro``).
            name: The exact name of the block.

        Returns:
            ``(start, end)`` character offsets: ``start`` is the position of the
            opening tag and ``end`` is the position just past the first matching
            ``end<block_type>`` tag that follows it, so
            ``file_content[start:end]`` is the whole block.

        Raises:
            ValueError: If the opening tag, or a closing tag after it, is not found.
        """
        tag = re.escape(block_type)

        # `(?!\w)` keeps a name from matching as a prefix of a longer one
        # (e.g. `customer` must not match a `customer_id` block). Everything up
        # to the first `%}` is accepted after the name so macro signatures with
        # quoted or multi-line defaults are still recognised.
        opening = re.search(
            r"\{%[-+]?\s*" + tag + r"\s+" + re.escape(name) + r"(?!\w).*?%\}",
            file_content,
            re.DOTALL,
        )
        if opening is None:
            raise ValueError(f"Unable to find a {block_type} block with the name {name}.")

        # Only consider closing tags located after the opening tag.
        closing = re.compile(r"\{%[-+]?\s*end" + tag + r"\s*[-+]?%\}").search(
            file_content, opening.end()
        )
        if closing is None:
            raise ValueError(f"Unable to find the closing end{block_type} block for {name}.")

        return opening.start(), closing.end()

    @staticmethod
    def isolate_content(file_content: str, start: int, end: int) -> str:
        """Given content, a start position, and an end position, return the content of a Jinja block."""
        return file_content[start:end]

    @classmethod
    def from_file(cls, path: Path, block_type: str, name: str) -> "JinjaBlock":
        """Find a specific Jinja block within a file, based on the block type and the name.

        Raises:
            ValueError: If the block cannot be located in the file.
        """
        file_content = path.read_text()
        start, end = cls.find_block_range(file_content, block_type, name)
        content = cls.isolate_content(file_content=file_content, start=start, end=end)

        return cls(
            path=path, block_type=block_type, name=name, start=start, end=end, content=content
        )


def find_doc_reference(content: str) -> Set[str]:
    """Find all doc block references within a string.

    Returns the set of names passed to ``doc(...)`` inside ``{{ ... }}``
    expressions; an empty set when ``content`` is empty or has no references.
    """
    if not content:
        return set()

    return set(_DOC_REFERENCE.findall(content))
+{% enddocs %} diff --git a/test-projects/split/split_proj/models/marts/__models.yml b/test-projects/split/split_proj/models/marts/__models.yml index a9823fa..685b402 100644 --- a/test-projects/split/split_proj/models/marts/__models.yml +++ b/test-projects/split/split_proj/models/marts/__models.yml @@ -2,11 +2,12 @@ version: 2 models: - name: customers - description: Customer overview data mart, offering key details for each unique + description: + Customer overview data mart, offering key details for each unique customer. One row per customer. columns: - name: customer_id - description: The unique key of the orders mart. + description: "{{ doc('customer_id') }}" tests: - not_null - unique @@ -19,20 +20,24 @@ models: - name: last_ordered_at description: The timestamp of a customer's most recent order. - name: lifetime_spend_pretax - description: The sum of all the pre-tax subtotals of every order a customer + description: + The sum of all the pre-tax subtotals of every order a customer has placed. - name: lifetime_spend - description: The sum of all the order totals (including tax) that a customer + description: + The sum of all the order totals (including tax) that a customer has ever placed. - name: customer_type - description: Options are 'new' or 'returning', indicating if a customer has + description: + Options are 'new' or 'returning', indicating if a customer has ordered more than once or has only placed their first order to date. tests: - accepted_values: values: [new, returning] - name: orders - description: Order overview data mart, offering key details for each order inlcluding + description: + Order overview data mart, offering key details for each order inlcluding if it's a customer's first order and a food vs. drink item breakdown. One row per order. 
tests: @@ -53,7 +58,8 @@ models: to: ref('stg_customers') field: customer_id - name: location_id - description: The foreign key relating to the location the order was placed + description: + The foreign key relating to the location the order was placed at. - name: order_total description: The total amount of the order in USD including tax. @@ -74,14 +80,14 @@ models: - name: order_cost description: The sum of supply expenses to fulfill the order. - name: location_name - description: The full location name of where this order was placed. Denormalized + description: + The full location name of where this order was placed. Denormalized from `stg_locations`. - name: is_food_order description: A boolean indicating if this order included any food items. - name: is_drink_order description: A boolean indicating if this order included any drink items. - - name: leaf_node description: A leaf node model that is not referenced by any other model. columns: @@ -89,4 +95,4 @@ models: description: The unique key of the leaf node. tests: - not_null - - unique \ No newline at end of file + - unique diff --git a/test-projects/split/split_proj/models/staging/__models.yml b/test-projects/split/split_proj/models/staging/__models.yml index 1667c52..125edb7 100644 --- a/test-projects/split/split_proj/models/staging/__models.yml +++ b/test-projects/split/split_proj/models/staging/__models.yml @@ -5,7 +5,7 @@ models: description: Customer data with basic cleaning and transformation applied, one row per customer. columns: - name: customer_id - description: The unique key for each customer. 
from dbt_meshify.storage.jinja_blocks import JinjaBlock

string = """\


{% docs customer_id %}
The unique key for each customer.
{% enddocs %}
"""

multiple_blocks = """\


{% docs customer_id %}
The unique key for each customer.
{% enddocs %}

{% docs potato_name %}
The name of the customer's favorite potato dish.
{% enddocs %}
"""


class TestJinjaBlock:
    """Checks for locating and slicing docs blocks by character offset."""

    def test_from_file_detects_block_range(self):
        # Two leading newlines put the opening tag at offset 2.
        expected = (2, 72)
        block_range = JinjaBlock.find_block_range(string, "docs", "customer_id")
        assert block_range == expected

    def test_from_file_extracts_content(self):
        expected = "{% docs customer_id %}\nThe unique key for each customer.\n{% enddocs %}"
        assert JinjaBlock.isolate_content(string, 2, 72) == expected

    def test_from_file_detects_block_range_in_multiple_blocks(self):
        # The second block must be found, not the first one in the file.
        expected = (74, 159)
        block_range = JinjaBlock.find_block_range(multiple_blocks, "docs", "potato_name")
        assert block_range == expected

    def test_from_file_extracts_content_in_files_with_multiple_blocks(self):
        expected = (
            "{% docs potato_name %}\nThe name of the customer's favorite potato dish.\n{% enddocs %}"
        )
        assert JinjaBlock.isolate_content(multiple_blocks, 74, 159) == expected