From 2eb40945564b42f9ee64e572e289d1a5e6bebaeb Mon Sep 17 00:00:00 2001 From: tricktx Date: Thu, 21 Mar 2024 14:36:08 -0300 Subject: [PATCH 1/3] update and fix data about electrical energy consumption --- .../br_mme_consumo_energia_eletrica__uf.sql | 15 ++ .../code/energia_eletrica.ipynb | 218 ++++++++++++++++++ .../schema.yml | 25 ++ 3 files changed, 258 insertions(+) create mode 100644 models/br_mme_consumo_energia_eletrica/br_mme_consumo_energia_eletrica__uf.sql create mode 100644 models/br_mme_consumo_energia_eletrica/code/energia_eletrica.ipynb create mode 100644 models/br_mme_consumo_energia_eletrica/schema.yml diff --git a/models/br_mme_consumo_energia_eletrica/br_mme_consumo_energia_eletrica__uf.sql b/models/br_mme_consumo_energia_eletrica/br_mme_consumo_energia_eletrica__uf.sql new file mode 100644 index 00000000..40f57531 --- /dev/null +++ b/models/br_mme_consumo_energia_eletrica/br_mme_consumo_energia_eletrica__uf.sql @@ -0,0 +1,15 @@ +{{ + config( + alias="uf", + schema="br_mme_consumo_energia_eletrica", + materialized="table", + ) +}} +select + safe_cast(ano as int64) ano, + safe_cast(mes as int64) mes, + safe_cast(sigla_uf as string) sigla_uf, + safe_cast(tipo_consumo as string) tipo_consumo, + safe_cast(numero_consumidores as float64) numero_consumidores, + safe_cast(consumo as float64) consumo +from `basedosdados-staging.br_mme_consumo_energia_eletrica_staging.uf` as t diff --git a/models/br_mme_consumo_energia_eletrica/code/energia_eletrica.ipynb b/models/br_mme_consumo_energia_eletrica/code/energia_eletrica.ipynb new file mode 100644 index 00000000..7706f9be --- /dev/null +++ b/models/br_mme_consumo_energia_eletrica/code/energia_eletrica.ipynb @@ -0,0 +1,218 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import warnings\n", + "warnings.filterwarnings('ignore')" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "def tratamento_consumo(sheet_name):\n", + " meses = {\n", + " \"JAN\": 1,\n", + " \"FEV\": 2,\n", + " \"MAR\": 3,\n", + " \"ABR\": 4,\n", + " \"MAI\": 5,\n", + " \"JUN\": 6,\n", + " \"JUL\": 7,\n", + " \"AGO\": 8,\n", + " \"SET\": 9,\n", + " \"OUT\": 10,\n", + " \"NOV\": 11,\n", + " \"DEZ\": 12\n", + " }\n", + " ufs = {'Acre':'AC', 'Alagoas':'AL', 'Amazonas':'AM', 'Amapá':'AP', 'Bahia':'BA', 'Ceará':'CE', 'Distrito Federal':'DF', 'Espírito Santo':'ES', 'Goiás':'GO', \n", + " 'Maranhão':'MA', 'Minas Gerais':'MG', 'Mato Grosso do Sul':'MS', 'Mato Grosso':'MT', 'Pará':'PA', 'Paraíba':'PB', 'Pernambuco':'PE', 'Piauí':'PI', \n", + " 'Paraná':'PR', 'Rio de Janeiro':'RJ', 'Rio Grande do Norte':'RN', 'Rondônia':'RO', 'Roraima':'RR', 'Rio Grande do Sul':'RS', \n", + " 'Santa Catarina':'SC', 'Sergipe':'SE', 'São Paulo':'SP', 'Tocantins':'TO'}\n", + " \n", + " df = pd.read_excel('/mnt/x/dados/consumo_energia_eletrica/dados.xls', sheet_name, skiprows=4, skipfooter=1, usecols='A:IG')\n", + " df_transposta = df.T\n", + " df_transposta.reset_index(inplace=True)\n", + " df_transposta.drop(df_transposta[['index', 2]], inplace=True, axis=1)\n", + " df_transposta.columns = df_transposta.iloc[0]\n", + " df_transposta = df_transposta[1:]\n", + " df_transposta.columns = ['ano', 'mes', 'Rondônia', 'Acre', 'Amazonas', 'Roraima', 'Pará',\n", + " 'Amapá', 'Tocantins', 'Maranhão', 'Piauí', 'Ceará',\n", + " 'Rio Grande do Norte', 'Paraíba', 'Pernambuco', 'Alagoas', 'Sergipe',\n", + " 'Bahia', 'Minas Gerais', 'Espírito Santo', 'Rio de Janeiro',\n", + " 'São Paulo', 'Paraná', 'Santa Catarina', 'Rio Grande do Sul',\n", + " 'Mato Grosso do Sul', 'Mato Grosso', 'Goiás', 'Distrito Federal']\n", + " df_transposta_melted = pd.melt(df_transposta, id_vars=[\"ano\", 'mes'], var_name=\"Estado\", value_name=\"Valor\")\n", + " df_transposta_melted['ano'].ffill(inplace=True)\n", + " df_transposta_melted['mes'] = df_transposta_melted['mes'].map(meses)\n", + "\n", + " df_transposta_melted.rename(columns={'Estado' : 'sigla_uf', 'Valor':'consumo'}, inplace=True) \n", + " df_transposta_melted['sigla_uf'] = df_transposta_melted['sigla_uf'].map(ufs)\n", + " return df_transposta_melted\n", + "\n", + "tipos_consumo = [\"Total\", \"Cativo\", \"Residencial\", \"Industrial\", \"Comercial\", \"Outros\"]\n", + "dfs_consumo = []\n", + "\n", + "# Realizando o tratamento para cada tipo de consumo\n", + "for i, tipo in enumerate(tipos_consumo):\n", + " if i < 6:\n", + " df = tratamento_consumo(i + 9)\n", + " df['tipo_consumo'] = tipo\n", + " dfs_consumo.append(df)\n", + "\n", + "# Concatenando todos os DataFrames\n", + "df_consumo = pd.concat(dfs_consumo)\n", + "df_consumo = df_consumo[['ano', 'mes', 'sigla_uf','tipo_consumo', 'consumo']]" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(38880, 5)" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_consumo.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "def tratamento_consumidores(sheet_name):\n", + " meses = {\n", + " \"JAN\": 1,\n", + " \"FEV\": 2,\n", + " \"MAR\": 3,\n", + " \"ABR\": 4,\n", + " \"MAI\": 5,\n", + " \"JUN\": 6,\n", + " \"JUL\": 7,\n", + " \"AGO\": 8,\n", + " \"SET\": 9,\n", + " \"OUT\": 10,\n", + " \"NOV\": 11,\n", + " \"DEZ\": 12\n", + " }\n", + " ufs = {'Acre':'AC', 'Alagoas':'AL', 'Amazonas':'AM', 'Amapá':'AP', 'Bahia':'BA', 'Ceará':'CE', 'Distrito Federal':'DF', 'Espírito Santo':'ES', 'Goiás':'GO', \n", + " 'Maranhão':'MA', 'Minas Gerais':'MG', 'Mato Grosso do Sul':'MS', 'Mato Grosso':'MT', 'Pará':'PA', 'Paraíba':'PB', 'Pernambuco':'PE', 'Piauí':'PI', \n", + " 'Paraná':'PR', 'Rio de Janeiro':'RJ', 'Rio Grande do Norte':'RN', 'Rondônia':'RO', 'Roraima':'RR', 'Rio Grande do Sul':'RS', \n", + " 'Santa Catarina':'SC', 'Sergipe':'SE', 'São Paulo':'SP', 'Tocantins':'TO'}\n", + " \n", + " df = pd.read_excel('/mnt/x/dados/consumo_energia_eletrica/dados.xls', sheet_name, skiprows=4, skipfooter=1, usecols='A:IG')\n", + " df_transposta = df.T\n", + " df_transposta.reset_index(inplace=True)\n", + " df_transposta.drop(df_transposta[['index', 2]], inplace=True, axis=1)\n", + " df_transposta.columns = df_transposta.iloc[0]\n", + " df_transposta = df_transposta[1:]\n", + " df_transposta.columns = ['ano', 'mes', 'Rondônia', 'Acre', 'Amazonas', 'Roraima', 'Pará',\n", + " 'Amapá', 'Tocantins', 'Maranhão', 'Piauí', 'Ceará',\n", + " 'Rio Grande do Norte', 'Paraíba', 'Pernambuco', 'Alagoas', 'Sergipe',\n", + " 'Bahia', 'Minas Gerais', 'Espírito Santo', 'Rio de Janeiro',\n", + " 'São Paulo', 'Paraná', 'Santa Catarina', 'Rio Grande do Sul',\n", + " 'Mato Grosso do Sul', 'Mato Grosso', 'Goiás', 'Distrito Federal']\n", + " df_transposta_melted = pd.melt(df_transposta, id_vars=[\"ano\", 'mes'], var_name=\"Estado\", value_name=\"Valor\")\n", + " df_transposta_melted['ano'].ffill(inplace=True)\n", + " df_transposta_melted['mes'] = df_transposta_melted['mes'].map(meses)\n", + "\n", + " df_transposta_melted.rename(columns={'Estado' : 'sigla_uf', 'Valor':'numero_consumidores'}, inplace=True) \n", + " df_transposta_melted['sigla_uf'] = df_transposta_melted['sigla_uf'].map(ufs)\n", + " return df_transposta_melted\n", + "\n", + "tipos_consumidores = ['Residencial', 'Industrial', 'Comercial', 'Outros']\n", + "dfs_consumidores = []\n", + "\n", + "# Realizando o tratamento para cada tipo de consumidores\n", + "for i, tipo in enumerate(tipos_consumidores):\n", + " if i < 6:\n", + " df = tratamento_consumidores(i + 15)\n", + " df['tipo_consumo'] = tipo\n", + " dfs_consumidores.append(df)\n", + "\n", + "# Concatenando todos os DataFrames\n", + "df_consumidores = pd.concat(dfs_consumidores)\n", + "df_consumidores = df_consumidores[['ano', 'mes', 'sigla_uf','tipo_consumo', 'numero_consumidores']]" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(25920, 5)" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_consumidores.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [], + "source": [ + "df_total = pd.merge(df_consumo, df_consumidores, how= 'left', on=['ano', 'mes', 'sigla_uf', 'tipo_consumo'])\n", + "df_total = df_total[['ano', 'mes', 'sigla_uf', 'tipo_consumo', 'numero_consumidores', 'consumo']]\n", + "df_total['consumo'] = df_total['consumo'].astype(int)" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [], + "source": [ + "df_total.to_csv(\"/mnt/x/dados/consumo_energia_eletrica/consumo_energia.csv\", sep=',', index=False, encoding='utf-8')" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/models/br_mme_consumo_energia_eletrica/schema.yml b/models/br_mme_consumo_energia_eletrica/schema.yml new file mode 100644 index 00000000..10e0124b --- /dev/null +++ b/models/br_mme_consumo_energia_eletrica/schema.yml @@ -0,0 +1,25 @@ +--- +version: 2 +models: + - name: br_mme_consumo_energia_eletrica__uf + description: Consumo de energia elétrica em nível nacional e segmentado pelas + classes residencial, industrial, comercial e outros (rural, serviço público + e iluminação pública). + tests: + - not_null_proportion_multiple_columns: + at_least: 0.05 + - dbt_utils.unique_combination_of_columns: + combination_of_columns: [ano, mes, sigla_uf, tipo_consumo] + columns: + - name: ano + description: Ano + - name: mes + description: Mês + - name: sigla_uf + description: Sigla da Unidade da Federação + - name: tipo_consumo + description: Tipo de Consumo + - name: numero_consumidores + description: Número de consumidores de energia elétrica atendidos pela rede + - name: consumo + description: Consumo de energia elétrica na rede (MWh) From 7476c37cc9640223145a400fbfcfcce166614bf8 Mon Sep 17 00:00:00 2001 From: tricktx Date: Thu, 21 Mar 2024 14:42:08 -0300 Subject: [PATCH 2/3] add dbt_project --- dbt_project.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/dbt_project.yml b/dbt_project.yml index 02fcfc26..6754974a 100644 --- a/dbt_project.yml +++ b/dbt_project.yml @@ -228,6 +228,9 @@ models: +post-hook: - REVOKE `roles/bigquery.dataViewer` ON TABLE {{ this }} FROM "specialGroup:allUsers" - GRANT `roles/bigquery.dataViewer` ON TABLE {{ this }} TO "group:bd-pro@basedosdados.org" + br_mme_consumo_energia_eletrica: + +materialized: table + +schema: br_mme_consumo_energia_eletrica br_mp_pep: +materialized: table +schema: br_mp_pep From 62b81763564baf0006baa270892073879b747859 Mon Sep 17 00:00:00 2001 From: tricktx Date: Fri, 22 Mar 2024 13:02:37 -0300 Subject: [PATCH 3/3] fix colum numero_consumidores --- .user.yml | 2 + .../br_mme_consumo_energia_eletrica__uf.sql | 16 +- .../code/energia_eletrica.ipynb | 196 ++++++++++++++++-- .../schema.yml | 13 ++ 4 files changed, 200 insertions(+), 27 deletions(-) create mode 100644 .user.yml mode change 100644 => 100755 models/br_mme_consumo_energia_eletrica/code/energia_eletrica.ipynb diff --git a/.user.yml b/.user.yml new file mode 100644 index 00000000..416339dd --- /dev/null +++ b/.user.yml @@ -0,0 +1,2 @@ +--- +id: cc3f54e0-fd01-4495-bd12-aa41f3b24444 diff --git a/models/br_mme_consumo_energia_eletrica/br_mme_consumo_energia_eletrica__uf.sql b/models/br_mme_consumo_energia_eletrica/br_mme_consumo_energia_eletrica__uf.sql index 40f57531..620f7a43 100644 --- a/models/br_mme_consumo_energia_eletrica/br_mme_consumo_energia_eletrica__uf.sql +++ b/models/br_mme_consumo_energia_eletrica/br_mme_consumo_energia_eletrica__uf.sql @@ -6,10 +6,14 @@ ) }} select - safe_cast(ano as int64) ano, - safe_cast(mes as int64) mes, - safe_cast(sigla_uf as string) sigla_uf, - safe_cast(tipo_consumo as string) tipo_consumo, - safe_cast(numero_consumidores as float64) numero_consumidores, - safe_cast(consumo as float64) consumo + safe_cast(ano as int64) as ano, + safe_cast(mes as int64) as mes, + safe_cast(sigla_uf as string) as sigla_uf, + safe_cast(tipo_consumo as string) as tipo_consumo, + case + when numero_consumidores = '0' + then null + else safe_cast(numero_consumidores as int64) + end as numero_consumidores, + safe_cast(consumo as int64) as consumo from `basedosdados-staging.br_mme_consumo_energia_eletrica_staging.uf` as t diff --git a/models/br_mme_consumo_energia_eletrica/code/energia_eletrica.ipynb b/models/br_mme_consumo_energia_eletrica/code/energia_eletrica.ipynb old mode 100644 new mode 100755 index 7706f9be..0aff9a8c --- a/models/br_mme_consumo_energia_eletrica/code/energia_eletrica.ipynb +++ b/models/br_mme_consumo_energia_eletrica/code/energia_eletrica.ipynb @@ -2,18 +2,19 @@ "cells": [ { "cell_type": "code", - "execution_count": 41, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import warnings\n", - "warnings.filterwarnings('ignore')" + "warnings.filterwarnings('ignore')\n", + "import numpy as np" ] }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -74,7 +75,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -83,7 +84,7 @@ "(38880, 5)" ] }, - "execution_count": 13, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -94,7 +95,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -155,33 +156,186 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "df_total = pd.merge(df_consumo, df_consumidores, how= 'left', on=['ano', 'mes', 'sigla_uf', 'tipo_consumo'])\n", + "df_total = df_total[['ano', 'mes', 'sigla_uf', 'tipo_consumo', 'numero_consumidores', 'consumo']]\n", + "df_total['consumo'] = df_total['consumo'].astype(int)\n", + "df_total['numero_consumidores'] = df_total['numero_consumidores'].fillna(0).astype(int)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, "metadata": {}, "outputs": [ { "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
anomessigla_uftipo_consumonumero_consumidoresconsumo
1584020041PEResidencial2001833228466
1584120042PEResidencial2004750217764
1584220043PEResidencial2018407225066
1584320044PEResidencial2026995232170
1584420045PEResidencial1972048210365
1584520046PEResidencial1963599198030
1584620047PEResidencial1954839192699
1584720048PEResidencial1949125189991
1584820049PEResidencial1989788224208
15849200410PEResidencial2022621212547
15850200411PEResidencial2046330226619
15851200412PEResidencial2040413240852
\n", + "
" + ], "text/plain": [ - "(25920, 5)" + " ano mes sigla_uf tipo_consumo numero_consumidores consumo\n", + "15840 2004 1 PE Residencial 2001833 228466\n", + "15841 2004 2 PE Residencial 2004750 217764\n", + "15842 2004 3 PE Residencial 2018407 225066\n", + "15843 2004 4 PE Residencial 2026995 232170\n", + "15844 2004 5 PE Residencial 1972048 210365\n", + "15845 2004 6 PE Residencial 1963599 198030\n", + "15846 2004 7 PE Residencial 1954839 192699\n", + "15847 2004 8 PE Residencial 1949125 189991\n", + "15848 2004 9 PE Residencial 1989788 224208\n", + "15849 2004 10 PE Residencial 2022621 212547\n", + "15850 2004 11 PE Residencial 2046330 226619\n", + "15851 2004 12 PE Residencial 2040413 240852" ] }, - "execution_count": 18, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df_consumidores.shape" - ] - }, - { - "cell_type": "code", - "execution_count": 38, - "metadata": {}, - "outputs": [], - "source": [ - "df_total = pd.merge(df_consumo, df_consumidores, how= 'left', on=['ano', 'mes', 'sigla_uf', 'tipo_consumo'])\n", - "df_total = df_total[['ano', 'mes', 'sigla_uf', 'tipo_consumo', 'numero_consumidores', 'consumo']]\n", - "df_total['consumo'] = df_total['consumo'].astype(int)" + "df_total[(df_total['ano'] == 2004) & (df_total['sigla_uf'] == 'PE') & (df_total['tipo_consumo'] == 'Residencial')]" ] }, { diff --git a/models/br_mme_consumo_energia_eletrica/schema.yml b/models/br_mme_consumo_energia_eletrica/schema.yml index 10e0124b..c20bb4e3 100644 --- a/models/br_mme_consumo_energia_eletrica/schema.yml +++ b/models/br_mme_consumo_energia_eletrica/schema.yml @@ -13,13 +13,26 @@ models: columns: - name: ano description: Ano + tests: + - relationships: + to: ref('br_bd_diretorios_data_tempo__ano') + field: ano.ano - name: mes description: Mês + tests: + - relationships: + to: ref('br_bd_diretorios_data_tempo__mes') + field: mes.mes - name: sigla_uf description: Sigla da Unidade da Federação + tests: + - relationships: + to: ref('br_bd_diretorios_brasil__uf') + field: sigla - name: tipo_consumo description: Tipo de Consumo - name: numero_consumidores description: Número de consumidores de energia elétrica atendidos pela rede - name: consumo description: Consumo de energia elétrica na rede (MWh) + tests: [not_null]