Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

added validate_df task to SupermetricsToADLS flow #785

Merged
merged 1 commit into from
Oct 25, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
91 changes: 90 additions & 1 deletion tests/integration/flows/test_supermetrics_to_adls.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,10 @@
from prefect.storage import Local

from viadot.flows import SupermetricsToADLS
from viadot.exceptions import ValidationError

CWD = os.getcwd()
adls_dir_path = "raw/supermetrics"
adls_dir_path = "raw/tests/supermetrics"
STORAGE = Local(path=CWD)

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -110,3 +111,91 @@ def test_supermetrics_to_adls_file_name(expectation_suite):
)
result = flow.run()
assert result.is_successful()


def test_supermetrics_to_adls_validate_df_success(expectation_suite):
    """Run SupermetricsToADLS with a `validate_df_dict` and expect full success.

    The flow is built with `column_size` and `column_list_to_match` entries in
    `validate_df_dict`; the test passes only if the flow's final state and the
    state of every individual task report success.
    """
    segment_ids = [
        "R1fbzFNQQ3q_GYvdpRr42w",
        "I8lnFFvdSFKc50lP7mBKNA",
        "Lg7jR0VWS5OqGPARtGYKrw",
        "h8ViuGLfRX-cCL4XKk6yfQ",
        "-1",
    ]
    requested_fields = [
        {"id": "Date"},
        {"id": "segment", "split": "column"},
        {"id": "AvgPageLoadTime_calc"},
    ]
    expected_columns = [
        "Date",
        "All Users",
        "M-Site_Better Space: All Landing Page Sessions",
        "M-site_Accessories: All Landing Page Sessions",
        "M-site_More Space: All Landing Page Sessions",
        "M-site_Replacement: All Landing Page Sessions",
    ]
    df_tests = {
        "column_size": {"Date": 10},
        "column_list_to_match": expected_columns,
    }

    flow = SupermetricsToADLS(
        "test_supermetrics_to_adls_validate_df_success",
        ds_id="GA",
        ds_segments=segment_ids,
        ds_accounts=["8326007", "58338899"],
        date_range_type="last_year_inc",
        fields=requested_fields,
        settings={"avoid_sampling": "true"},
        order_columns="alphabetic",
        max_columns=100,
        max_rows=10,
        expectation_suite=expectation_suite,
        evaluation_parameters={"previous_run_row_count": 9},
        adls_dir_path=adls_dir_path,
        parallel=False,
        validate_df_dict=df_tests,
    )

    final_state = flow.run()
    assert final_state.is_successful()

    # The flow-level state alone is not enough: check each task's state too.
    per_task_states = final_state.result.values()
    assert all(state.is_successful() for state in per_task_states)


def test_supermetrics_to_adls_validate_df_fail(expectation_suite):
    """Run SupermetricsToADLS with a mismatching `validate_df_dict` and expect failure.

    The `column_list_to_match` passed here is not the column list used by the
    success test, so the validation step is expected to reject the dataframe.

    The test accepts either of two failure signals:
      * `flow.run()` raises `ValidationError` directly, or
      * `flow.run()` returns a final state that reports failure.

    A run that completes without either signal fails the test. Previously the
    test had no assertion and passed whenever no unexpected exception escaped.
    """
    flow = SupermetricsToADLS(
        "test_supermetrics_to_adls_validate_df_fail",
        ds_id="GA",
        ds_segments=[
            "R1fbzFNQQ3q_GYvdpRr42w",
            "I8lnFFvdSFKc50lP7mBKNA",
            "Lg7jR0VWS5OqGPARtGYKrw",
            "h8ViuGLfRX-cCL4XKk6yfQ",
            "-1",
        ],
        ds_accounts=["8326007", "58338899"],
        date_range_type="last_year_inc",
        fields=[
            {"id": "Date"},
            {"id": "segment", "split": "column"},
            {"id": "AvgPageLoadTime_calc"},
        ],
        settings={"avoid_sampling": "true"},
        order_columns="alphabetic",
        max_columns=100,
        max_rows=10,
        expectation_suite=expectation_suite,
        evaluation_parameters={"previous_run_row_count": 9},
        adls_dir_path=adls_dir_path,
        parallel=False,
        validate_df_dict={
            "column_list_to_match": [
                "All Users",
                "All Landing Page Sessions",
                "All Landing Page Sessions",
                "All Landing Page Sessions",
                "All Landing Page Sessions",
            ],
        },
    )

    try:
        result = flow.run()
    except ValidationError:
        # The error escaped the runner: validation failed as expected.
        return

    # NOTE(review): the flow runner normally captures task exceptions in the
    # returned state rather than raising them — confirm against the Prefect
    # version in use. Either way, a non-failed state means validation did not
    # reject the dataframe.
    assert result.is_failed()
14 changes: 14 additions & 0 deletions viadot/flows/supermetrics_to_adls.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
union_dfs_task,
update_dtypes_dict,
write_to_json,
validate_df,
)
from viadot.tasks import (
AzureDataLakeUpload,
Expand Down Expand Up @@ -68,6 +69,7 @@ def __init__(
vault_name: str = None,
check_missing_data: bool = True,
timeout: int = 3600,
validate_df_dict: dict = None,
*args: List[any],
**kwargs: Dict[str, Any],
):
Expand Down Expand Up @@ -112,6 +114,8 @@ def __init__(
check_missing_data (bool, optional): Whether to check missing data. Defaults to True.
timeout (int, optional): The amount of time (in seconds) to wait while running this task before
a timeout occurs. Defaults to 3600.
validate_df_dict (dict, optional): A dictionary of tests used to verify the output DataFrame.
If provided, triggers the `validate_df` task from task_utils. Defaults to None.
"""
if not ds_user:
try:
Expand Down Expand Up @@ -140,6 +144,9 @@ def __init__(
self.if_exists = if_exists
self.output_file_extension = output_file_extension

# validate_df
self.validate_df_dict = validate_df_dict

# RunGreatExpectationsValidation
self.expectation_suite = expectation_suite
self.expectations_path = "/home/viadot/tmp/expectations"
Expand Down Expand Up @@ -229,6 +236,13 @@ def gen_flow(self) -> Flow:
else:
df = self.gen_supermetrics_task(ds_accounts=self.ds_accounts, flow=self)

# run validate_df task from task_utils
if self.validate_df_dict:
validation_df_task = validate_df.bind(
df, tests=self.validate_df_dict, flow=self
)
validation_df_task.set_upstream(df, flow=self)

write_json = write_to_json.bind(
dict_=self.expectation_suite,
path=os.path.join(
Expand Down
Loading