From bc627afaffa97e2450ce6489a22bf4ad26e2d73f Mon Sep 17 00:00:00 2001
From: Vinicius Aguiar
Date: Sun, 4 Feb 2024 13:15:52 -0300
Subject: [PATCH] feat: add page views metadata (#557)

---
 ...027_dataset_page_views_table_page_views.py | 28 ++++++++
 bd_api/apps/api/v1/models.py                  | 10 ++-
 bd_api/apps/api/v1/tasks.py                   | 68 ++++++++++++++++++-
 3 files changed, 104 insertions(+), 2 deletions(-)
 create mode 100644 bd_api/apps/api/v1/migrations/0027_dataset_page_views_table_page_views.py

diff --git a/bd_api/apps/api/v1/migrations/0027_dataset_page_views_table_page_views.py b/bd_api/apps/api/v1/migrations/0027_dataset_page_views_table_page_views.py
new file mode 100644
index 00000000..5518edb7
--- /dev/null
+++ b/bd_api/apps/api/v1/migrations/0027_dataset_page_views_table_page_views.py
@@ -0,0 +1,28 @@
+# -*- coding: utf-8 -*-
+# Generated by Django 4.2.6 on 2024-02-04 16:08
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ("v1", "0026_alter_table_source_bucket_name"),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name="dataset",
+            name="page_views",
+            field=models.BigIntegerField(
+                default=0, help_text="Number of page views by Google Analytics"
+            ),
+        ),
+        migrations.AddField(
+            model_name="table",
+            name="page_views",
+            field=models.BigIntegerField(
+                default=0, help_text="Number of page views by Google Analytics"
+            ),
+        ),
+    ]
diff --git a/bd_api/apps/api/v1/models.py b/bd_api/apps/api/v1/models.py
index 8eb835a4..24326f62 100644
--- a/bd_api/apps/api/v1/models.py
+++ b/bd_api/apps/api/v1/models.py
@@ -551,6 +551,10 @@ class Dataset(BaseModel):
     is_closed = models.BooleanField(
         default=False, help_text="Dataset is for BD Pro subscribers only"
     )
+    page_views = models.BigIntegerField(
+        default=0,
+        help_text="Number of page views by Google Analytics",
+    )
 
     graphql_nested_filter_fields_whitelist = ["id", "slug"]
 
@@ -946,8 +950,12 @@ class Table(BaseModel, OrderedModel):
     number_rows = models.BigIntegerField(blank=True, null=True)
     number_columns = models.BigIntegerField(blank=True, null=True)
     is_closed = models.BooleanField(default=False, help_text="Table is for BD Pro subscribers only")
-    order_with_respect_to = ("dataset",)
+    page_views = models.BigIntegerField(
+        default=0,
+        help_text="Number of page views by Google Analytics",
+    )
 
+    order_with_respect_to = ("dataset",)
     graphql_nested_filter_fields_whitelist = ["id", "dataset"]
 
     def __str__(self):
diff --git a/bd_api/apps/api/v1/tasks.py b/bd_api/apps/api/v1/tasks.py
index 61144805..91a03adf 100644
--- a/bd_api/apps/api/v1/tasks.py
+++ b/bd_api/apps/api/v1/tasks.py
@@ -1,4 +1,6 @@
 # -*- coding: utf-8 -*-
+from datetime import datetime, timedelta
+
 from django.core.management import call_command
 from google.api_core.exceptions import BadRequest, NotFound
 from google.cloud.bigquery import Table as GBQTable
@@ -7,7 +9,7 @@
 from loguru import logger
 from pandas import read_gbq
 
-from bd_api.apps.api.v1.models import Table
+from bd_api.apps.api.v1.models import Dataset, Table
 from bd_api.custom.client import get_gbq_client, get_gcs_client, send_discord_message
 from bd_api.utils import production_task
 
@@ -121,3 +123,67 @@ def format_msg(msg: list[str]) -> str:
 
     if msg := format_msg(msg):
         send_discord_message(msg)
+
+
+@periodic_task(crontab(hour="6", minute="0"))
+@production_task
+def update_page_views_task(backfill: bool = False):
+    """Add Google Analytics page views to the Table and Dataset counters.
+
+    Counts `page_view` events per (table, dataset) pair in the BigQuery
+    analytics export and adds each count to the `page_views` field of the
+    matching `Table` and `Dataset` rows.
+
+    Args:
+        backfill: If true, query every `events_*` table and zero the
+            counters of the returned ids before accumulating. Otherwise
+            query only yesterday's `events_YYYYMMDD` table.
+    """
+    if backfill:
+        event_table = "events_*"
+    else:
+        yesterday = datetime.now() - timedelta(1)
+        yesterday = yesterday.strftime("%Y%m%d")
+        event_table = f"events_{yesterday}"
+
+    # The regex quantifiers are written `{{36}}` because this is an f-string:
+    # a bare `{36}` would be formatted as the text `36` and break the pattern
+    query = rf"""
+        select
+            count(1) page_views
+            , regexp_extract(param.value.string_value, r'table=([a-z0-9-]{{36}})') table_id
+            , regexp_extract(param.value.string_value, r'dataset\/([a-z0-9-]{{36}})') dataset_id
+        from `basedosdados.analytics_295884852.{event_table}` event
+        join unnest(event_params) param
+        where
+            true
+            and event_name = 'page_view'
+            and param.key = 'page_location'
+            and param.value.string_value like '%/dataset/%'
+        group by
+            table_id,
+            dataset_id
+        having
+            true
+            and table_id is not null
+            and dataset_id is not null
+    """
+    metadata = read_gbq(query)
+
+    if backfill:
+        for table_id in metadata["table_id"].unique():
+            if table := Table.objects.filter(id=table_id).first():
+                table.page_views = 0
+                table.save()
+        for dataset_id in metadata["dataset_id"].unique():
+            if dataset := Dataset.objects.filter(id=dataset_id).first():
+                dataset.page_views = 0
+                dataset.save()
+
+    for _, (page_views, table_id, dataset_id) in metadata.iterrows():
+        if table := Table.objects.filter(id=table_id).first():
+            table.page_views += page_views
+            table.save()
+        if dataset := Dataset.objects.filter(id=dataset_id).first():
+            dataset.page_views += page_views
+            dataset.save()