diff --git a/.github/workflows/scripts/table_test.py b/.github/workflows/scripts/table_test.py
new file mode 100644
index 00000000..de8fbc42
--- /dev/null
+++ b/.github/workflows/scripts/table_test.py
@@ -0,0 +1,236 @@
+import re
+from argparse import ArgumentParser
+from time import sleep
+
+from backend import Backend
+from utils import expand_alls, get_datasets_tables_from_modified_files
+
+
+def get_flow_run_state(flow_run_id: str, backend: Backend, auth_token: str):
+    query = """
+    query ($flow_run_id: uuid!) {
+        flow_run_by_pk (id: $flow_run_id) {
+            state
+        }
+    }
+    """
+    response = backend._execute_query(
+        query,
+        variables={"flow_run_id": flow_run_id},
+        headers={"Authorization": f"Bearer {auth_token}"},
+    )
+    return response["flow_run_by_pk"]["state"]
+
+
+def get_flow_status_logs(flow_run_id: str, backend: Backend, auth_token: str):
+    query = """query ($flow_run_id: uuid!){
+        log(where:{
+            flow_run_id:{_eq:$flow_run_id},
+            message:{_like:"%Done.%"}}){
+            message
+        }
+    }"""
+    response = backend._execute_query(
+        query,
+        variables={"flow_run_id": flow_run_id},
+        headers={"Authorization": f"Bearer {auth_token}"},
+    )
+    # `log` comes back as a list of matching entries; parse the test counts
+    # out of the first "Done." summary message.
+    message = response["log"][0]["message"]
+    result = {}
+    result["pass"] = int(re.findall(r"PASS=\d+", message)[0].split("=")[1])
+    result["skip"] = int(re.findall(r"SKIP=\d+", message)[0].split("=")[1])
+    result["warn"] = int(re.findall(r"WARN=\d+", message)[0].split("=")[1])
+    return result
+
+
+def get_materialization_flow_id(backend: Backend, auth_token: str):
+    query = """
+    query {
+        flow (where: {
+            name: {
+                _like: "BD template: Executa DBT model"
+            },
+            archived: {
+                _eq: false
+            },
+            project: {
+                name: {_eq: "main"}
+            }
+        }) {
+            id
+        }
+    }
+    """
+    response = backend._execute_query(
+        query, headers={"Authorization": f"Bearer {auth_token}"}
+    )
+    return response["flow"][0]["id"]
+
+
+if __name__ == "__main__":
+    # Start argument parser
+    arg_parser = ArgumentParser()
+
+    # Add GraphQL URL argument
+    arg_parser.add_argument(
+        "--graphql-url",
+        type=str,
+        required=True,
+        help="URL of the GraphQL endpoint.",
+    )
+
+    # Add list of modified files argument
+    arg_parser.add_argument(
+        "--modified-files",
+        type=str,
+        required=True,
+        help="Comma-separated list of modified files.",
+    )
+
+    # Add Prefect backend URL argument
+    arg_parser.add_argument(
+        "--prefect-backend-url",
+        type=str,
+        required=False,
+        default="https://prefect.basedosdados.org/api",
+        help="Prefect backend URL.",
+    )
+
+    # Add Prefect base URL argument
+    arg_parser.add_argument(
+        "--prefect-base-url",
+        type=str,
+        required=False,
+        default="https://prefect.basedosdados.org",
+        help="Prefect base URL.",
+    )
+
+    # Add Prefect API token argument
+    arg_parser.add_argument(
+        "--prefect-backend-token",
+        type=str,
+        required=True,
+        help="Prefect backend token.",
+    )
+
+    # Add materialization mode argument
+    arg_parser.add_argument(
+        "--materialization-mode",
+        type=str,
+        required=False,
+        default="dev",
+        help="Materialization mode.",
+    )
+
+    # Add materialization label argument
+    arg_parser.add_argument(
+        "--materialization-label",
+        type=str,
+        required=False,
+        default="basedosdados-dev",
+        help="Materialization label.",
+    )
+
+    # Add dbt command argument
+    arg_parser.add_argument(
+        "--dbt-command",
+        type=str,
+        required=False,
+        default="test",
+        help="dbt command to run.",
+    )
+
+    # Get arguments
+    args = arg_parser.parse_args()
+
+    # Get datasets and tables from modified files
+    modified_files = args.modified_files.split(",")
+    datasets_tables = get_datasets_tables_from_modified_files(
+        modified_files, show_details=True
+    )
+
+    # Split deleted datasets and tables
+    deleted_datasets_tables = []
+    existing_datasets_tables = []
+    for dataset_id, table_id, exists, alias in datasets_tables:
+        if exists:
+            existing_datasets_tables.append((dataset_id, table_id, alias))
+        else:
+            deleted_datasets_tables.append((dataset_id, table_id, alias))
+
+    # Expand `__all__` tables
+    backend = Backend(args.graphql_url)
+    expanded_existing_datasets_tables = []
+    for dataset_id, table_id, alias in existing_datasets_tables:
+        expanded_table_ids = expand_alls(dataset_id, table_id, backend)
+        for expanded_dataset_id, expanded_table_id in expanded_table_ids:
+            expanded_existing_datasets_tables.append(
+                (expanded_dataset_id, expanded_table_id, alias)
+            )
+    existing_datasets_tables = expanded_existing_datasets_tables
+
+    # Launch materialization flows
+    backend = Backend(args.prefect_backend_url)
+    flow_id = get_materialization_flow_id(backend, args.prefect_backend_token)
+    launched_flow_run_ids = []
+    for dataset_id, table_id, alias in existing_datasets_tables:
+        print(
+            f"Launching materialization flow for {dataset_id}.{table_id} (alias={alias})..."
+        )
+        parameters = {
+            "dataset_id": dataset_id,
+            "dbt_alias": alias,
+            "mode": args.materialization_mode,
+            "table_id": table_id,
+            "dbt_command": args.dbt_command,
+        }
+
+        mutation = """
+        mutation ($flow_id: UUID, $parameters: JSON, $label: String!) {
+            create_flow_run (input: {
+                flow_id: $flow_id,
+                parameters: $parameters,
+                labels: [$label],
+            }) {
+                id
+            }
+        }
+        """
+        variables = {
+            "flow_id": flow_id,
+            "parameters": parameters,
+            "label": args.materialization_label,
+        }
+
+        response = backend._execute_query(
+            mutation,
+            variables,
+            headers={"Authorization": f"Bearer {args.prefect_backend_token}"},
+        )
+
+        flow_run_id = response["create_flow_run"]["id"]
+        launched_flow_run_ids.append(flow_run_id)
+        flow_run_url = f"{args.prefect_base_url}/flow-run/{flow_run_id}"
+        print(f" - Materialization flow run launched: {flow_run_url}")
+
+    # Keep monitoring the launched flow runs until they are finished
+    for launched_flow_run_id in launched_flow_run_ids:
+        print(f"Monitoring flow run {launched_flow_run_id}...")
+        flow_run_state = get_flow_run_state(
+            flow_run_id=launched_flow_run_id,
+            backend=backend,
+            auth_token=args.prefect_backend_token,
+        )
+        while flow_run_state not in ["Success", "Failed", "Cancelled"]:
+            sleep(5)
+            flow_run_state = get_flow_run_state(
+                flow_run_id=launched_flow_run_id,
+                backend=backend,
+                auth_token=args.prefect_backend_token,
+            )
+        if flow_run_state != "Success":
+            raise Exception(
+                f'Flow run {launched_flow_run_id} finished with state "{flow_run_state}". '
+                f"Check the logs at {args.prefect_base_url}/flow-run/{launched_flow_run_id}"
+            )
+        else:
+            print("Congrats! Everything seems fine!")
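The pass/skip/warn counting in get_flow_status_logs above is a regex scrape of dbt's closing summary line. A minimal standalone sketch of that parsing; the message text below is an assumed example of dbt's usual "Done." format, not a captured log:

import re

message = "Done. PASS=42 WARN=1 ERROR=0 SKIP=3 TOTAL=46"
result = {
    key.lower(): int(re.search(rf"{key}=(\d+)", message).group(1))
    for key in ("PASS", "SKIP", "WARN")
}
print(result)  # {'pass': 42, 'skip': 3, 'warn': 1}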
Everything seems fine!") diff --git a/.github/workflows/test_dbt_model.yaml b/.github/workflows/test_dbt_model.yaml new file mode 100644 index 00000000..5111e0d1 --- /dev/null +++ b/.github/workflows/test_dbt_model.yaml @@ -0,0 +1,34 @@ +--- +name: Test DBT model +on: + pull_request: + types: [labeled, opened] + branches: [main] + paths: [models/**, .github/workflows/test_dbt_model.yaml] +jobs: + test_dbt_model: + if: contains(github.event.pull_request.labels.*.name, 'test-dev-model') + name: Test DBT dev model + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + ref: ${{ github.head_ref }} + - name: Get all changed files using a comma separator + id: changed-files + uses: tj-actions/changed-files@v35 + with: + separator: ',' + - name: Set up poetry + run: pipx install poetry + - name: Set up python + uses: actions/setup-python@v4 + with: + cache: poetry + python-version: '3.9' + - name: Install requirements + run: poetry install --only=dev + - name: Run script to test DBT model + run: |- + poetry run python .github/workflows/scripts/table_test.py --modified-files ${{ steps.changed-files.outputs.all_modified_files }} --graphql-url ${{ secrets.BACKEND_GRAPHQL_URL }} --prefect-backend-token ${{ secrets.PREFECT_BACKEND_TOKEN }} diff --git a/.user.yml b/.user.yml new file mode 100644 index 00000000..416339dd --- /dev/null +++ b/.user.yml @@ -0,0 +1,2 @@ +--- +id: cc3f54e0-fd01-4495-bd12-aa41f3b24444 diff --git a/dbt_project.yml b/dbt_project.yml index 02fcfc26..4f56fc28 100644 --- a/dbt_project.yml +++ b/dbt_project.yml @@ -228,12 +228,18 @@ models: +post-hook: - REVOKE `roles/bigquery.dataViewer` ON TABLE {{ this }} FROM "specialGroup:allUsers" - GRANT `roles/bigquery.dataViewer` ON TABLE {{ this }} TO "group:bd-pro@basedosdados.org" + br_mme_consumo_energia_eletrica: + +materialized: table + +schema: br_mme_consumo_energia_eletrica br_mp_pep: +materialized: table +schema: br_mp_pep br_ms_cnes: +materialized: table +schema: br_ms_cnes + br_ms_sia: + +materialized: table + +schema: br_ms_sia br_ms_sim: +materialized: table +schema: br_ms_sim diff --git a/models/br_camara_dados_abertos/br_camara_dados_abertos__orgao_deputado.sql b/models/br_camara_dados_abertos/br_camara_dados_abertos__orgao_deputado.sql index f5f82eda..a1770fe0 100644 --- a/models/br_camara_dados_abertos/br_camara_dados_abertos__orgao_deputado.sql +++ b/models/br_camara_dados_abertos/br_camara_dados_abertos__orgao_deputado.sql @@ -1,12 +1,25 @@ {{ config(alias="orgao_deputado", schema="br_camara_dados_abertos") }} -select distinct - regexp_extract(uriorgao, r'/orgaos/(\d+)') as id_orgao, - safe_cast(nomeorgao as string) nome, - safe_cast(siglaorgao as string) sigla, - safe_cast(nomedeputado as string) nome_deputado, - safe_cast(cargo as string) cargo, - safe_cast(siglauf as string) sigla_uf, - safe_cast(datainicio as date) data_inicio, - safe_cast(datafim as date) data_final, - safe_cast(siglapartido as string) sigla_partido, -from `basedosdados-staging.br_camara_dados_abertos_staging.orgao_deputado` as t +with + orgao_deputado as ( + select distinct + regexp_extract(uriorgao, r'/orgaos/(\d+)') as id_orgao, + safe_cast(nomeorgao as string) nome, + safe_cast(siglaorgao as string) sigla, + safe_cast(nomedeputado as string) nome_deputado, + safe_cast(cargo as string) cargo, + safe_cast(siglauf as string) sigla_uf, + safe_cast(datainicio as date) data_inicio, + safe_cast(datafim as date) data_final, + safe_cast(siglapartido as string) sigla_partido, + from 
diff --git a/models/br_camara_dados_abertos/br_camara_dados_abertos__orgao_deputado.sql b/models/br_camara_dados_abertos/br_camara_dados_abertos__orgao_deputado.sql
index f5f82eda..a1770fe0 100644
--- a/models/br_camara_dados_abertos/br_camara_dados_abertos__orgao_deputado.sql
+++ b/models/br_camara_dados_abertos/br_camara_dados_abertos__orgao_deputado.sql
@@ -1,12 +1,25 @@
 {{ config(alias="orgao_deputado", schema="br_camara_dados_abertos") }}
-select distinct
-    regexp_extract(uriorgao, r'/orgaos/(\d+)') as id_orgao,
-    safe_cast(nomeorgao as string) nome,
-    safe_cast(siglaorgao as string) sigla,
-    safe_cast(nomedeputado as string) nome_deputado,
-    safe_cast(cargo as string) cargo,
-    safe_cast(siglauf as string) sigla_uf,
-    safe_cast(datainicio as date) data_inicio,
-    safe_cast(datafim as date) data_final,
-    safe_cast(siglapartido as string) sigla_partido,
-from `basedosdados-staging.br_camara_dados_abertos_staging.orgao_deputado` as t
+with
+    orgao_deputado as (
+        select distinct
+            regexp_extract(uriorgao, r'/orgaos/(\d+)') as id_orgao,
+            safe_cast(nomeorgao as string) nome,
+            safe_cast(siglaorgao as string) sigla,
+            safe_cast(nomedeputado as string) nome_deputado,
+            safe_cast(cargo as string) cargo,
+            safe_cast(siglauf as string) sigla_uf,
+            safe_cast(datainicio as date) data_inicio,
+            safe_cast(datafim as date) data_final,
+            safe_cast(siglapartido as string) sigla_partido,
+        from
+            `basedosdados-staging.br_camara_dados_abertos_staging.orgao_deputado`
+    )
+select *
+from orgao_deputado
+where
+    not (
+        nome_deputado = 'Hélio Leite'
+        and cargo = 'Titular'
+        and sigla_uf is null
+        and data_inicio = '2022-05-03'
+        and data_final = '2023-02-01'
+    )
diff --git a/models/br_camara_dados_abertos/br_camara_dados_abertos__proposicao_autor.sql b/models/br_camara_dados_abertos/br_camara_dados_abertos__proposicao_autor.sql
index c6214862..71387eca 100644
--- a/models/br_camara_dados_abertos/br_camara_dados_abertos__proposicao_autor.sql
+++ b/models/br_camara_dados_abertos/br_camara_dados_abertos__proposicao_autor.sql
@@ -1,6 +1,6 @@
 {{ config(alias="proposicao_autor", schema="br_camara_dados_abertos") }}
-select
+select distinct
     safe_cast(idproposicao as string) id_proposicao,
     replace(safe_cast(iddeputadoautor as string), ".0", "") id_deputado,
     initcap(safe_cast(tipoautor as string)) tipo_autor,
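The replace(safe_cast(iddeputadoautor as string), ".0", "") in proposicao_autor exists because staging loads the ID column as a float, so a plain string cast leaves a trailing ".0". The same effect in Python, with an illustrative value rather than real data:

# Staging stores iddeputadoautor as a float; string-casting yields "204379.0".
id_deputado_autor = 204379.0
print(str(id_deputado_autor))                    # 204379.0
print(str(id_deputado_autor).replace(".0", ""))  # 204379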
diff --git a/models/br_camara_dados_abertos/br_camara_dados_abertos__proposicao_microdados.sql b/models/br_camara_dados_abertos/br_camara_dados_abertos__proposicao_microdados.sql
index f99989ff..44f0c262 100644
--- a/models/br_camara_dados_abertos/br_camara_dados_abertos__proposicao_microdados.sql
+++ b/models/br_camara_dados_abertos/br_camara_dados_abertos__proposicao_microdados.sql
@@ -6,43 +6,88 @@
         partition_by={
             "field": "ano",
             "data_type": "INT64",
-            "range": {"start": 1935, "end": 2023, "interval": 1},
+            "range": {"start": 1935, "end": 2024, "interval": 1},
         },
     )
 }}
-select
-    safe_cast(ano as int64) ano,
-    safe_cast(
-        split(
-            format_timestamp('%Y-%m-%dT%H:%M:%E*S', timestamp(dataapresentacao)), 'T'
-        )[offset(0)] as date
-    ) data,
-    safe_cast(
-        split(
-            format_timestamp('%Y-%m-%dT%H:%M:%E*S', timestamp(dataapresentacao)), 'T'
-        )[offset(1)] as time
-    ) horario,
-    safe_cast(id as string) id_proposicao,
-    safe_cast(uri as string) url,
-    safe_cast(numero as string) numero,
-    safe_cast(siglatipo as string) sigla,
-    safe_cast(descricaotipo as string) tipo,
-    safe_cast(ementa as string) ementa,
-    safe_cast(ementadetalhada as string) ementa_detalhada,
-    safe_cast(keywords as string) palavra_chave,
-    safe_cast(uriorgaonumerador as string) url_orgao_numerador,
-    safe_cast(uripropprincipal as string) url_principal,
-    safe_cast(uripropposterior as string) url_posterior,
-    safe_cast(urlinteiroteor as string) url_teor_proposicao,
-    safe_cast(ultimostatus_datahora as string) data_hora_ultimo_status,
-    safe_cast(ultimostatus_urirelator as string) url_relator_ultimo_status,
-    safe_cast(ultimostatus_siglaorgao as string) sigla_orgao_ultimo_status,
-    safe_cast(ultimostatus_regime as string) regime_ultimo_status,
-    safe_cast(ultimostatus_descricaotramitacao as string) tramitacao_ultimo_status,
-    safe_cast(ultimostatus_descricaosituacao as string) situacao_ultimo_status,
-    safe_cast(ultimostatus_despacho as string) despacho_ultimo_status,
-    safe_cast(ultimostatus_apreciacao as string) apreciacao_ultimo_status,
-    safe_cast(ultimostatus_sequencia as string) sequencia_ultimo_status,
-    safe_cast(ultimostatus_url as string) url_ultimo_status,
-from `basedosdados-staging.br_camara_dados_abertos_staging.proposicao_microdados` as t
+with
+    table as (
+        select
+            safe_cast(ano as int64) ano,
+            safe_cast(
+                split(
+                    format_timestamp(
+                        '%Y-%m-%dT%H:%M:%E*S', timestamp(dataapresentacao)
+                    ),
+                    'T'
+                )[offset (0)] as date
+            ) data,
+            safe_cast(
+                split(
+                    format_timestamp(
+                        '%Y-%m-%dT%H:%M:%E*S', timestamp(dataapresentacao)
+                    ),
+                    'T'
+                )[offset (1)] as time
+            ) horario,
+            safe_cast(id as string) id_proposicao,
+            safe_cast(uri as string) url,
+            safe_cast(numero as string) numero,
+            safe_cast(siglatipo as string) sigla,
+            safe_cast(descricaotipo as string) tipo,
+            safe_cast(ementa as string) ementa,
+            safe_cast(ementadetalhada as string) ementa_detalhada,
+            safe_cast(keywords as string) palavra_chave,
+            safe_cast(uriorgaonumerador as string) url_orgao_numerador,
+            safe_cast(uripropprincipal as string) url_principal,
+            safe_cast(uripropposterior as string) url_posterior,
+            safe_cast(urlinteiroteor as string) url_teor_proposicao,
+            safe_cast(ultimostatus_datahora as string) data_hora_ultimo_status,
+            safe_cast(ultimostatus_urirelator as string) url_relator_ultimo_status,
+            safe_cast(ultimostatus_siglaorgao as string) sigla_orgao_ultimo_status,
+            safe_cast(ultimostatus_regime as string) regime_ultimo_status,
+            safe_cast(
+                ultimostatus_descricaotramitacao as string
+            ) tramitacao_ultimo_status,
+            safe_cast(ultimostatus_descricaosituacao as string) situacao_ultimo_status,
+            safe_cast(ultimostatus_despacho as string) despacho_ultimo_status,
+            safe_cast(ultimostatus_apreciacao as string) apreciacao_ultimo_status,
+            safe_cast(ultimostatus_sequencia as string) sequencia_ultimo_status,
+            safe_cast(ultimostatus_url as string) url_ultimo_status,
+        from
+            `basedosdados-staging.br_camara_dados_abertos_staging.proposicao_microdados`
+            as t
+    ),
+    query_total as (
+        select
+            ano,
+            case when data >= current_date() then null else data end as data,
+            horario,
+            id_proposicao,
+            url,
+            numero,
+            sigla,
+            tipo,
+            ementa,
+            ementa_detalhada,
+            palavra_chave,
+            url_orgao_numerador,
+            url_principal,
+            url_posterior,
+            url_teor_proposicao,
+            data_hora_ultimo_status,
+            url_relator_ultimo_status,
+            sigla_orgao_ultimo_status,
+            regime_ultimo_status,
+            tramitacao_ultimo_status,
+            situacao_ultimo_status,
+            despacho_ultimo_status,
+            apreciacao_ultimo_status,
+            sequencia_ultimo_status,
+            url_ultimo_status,
+        from table
+    )
+select distinct *
+from query_total
+where not (ano = 2011 and id_proposicao = '510035')
diff --git a/models/br_camara_dados_abertos/br_camara_dados_abertos__proposicao_tema.sql b/models/br_camara_dados_abertos/br_camara_dados_abertos__proposicao_tema.sql
index 10d0cc64..63bb9148 100644
--- a/models/br_camara_dados_abertos/br_camara_dados_abertos__proposicao_tema.sql
+++ b/models/br_camara_dados_abertos/br_camara_dados_abertos__proposicao_tema.sql
@@ -11,11 +11,17 @@
     )
 }}
-select
-    safe_cast(replace(ano, ".0", "") as int64) ano,
-    regexp_extract(uriproposicao, r'/proposicoes/(\d+)') as id_proposicao,
-    safe_cast(siglatipo as string) tipo_proposicao,
-    safe_cast(numero as string) numero,
-    safe_cast(codtema as string) tema,
-    safe_cast(relevancia as int64) relevancia,
-from `basedosdados-staging.br_camara_dados_abertos_staging.proposicao_tema` as t
+with
+    tables as (
+        select
+            safe_cast(replace(ano, ".0", "") as int64) as ano,
+            regexp_extract(uriproposicao, r'/proposicoes/(\d+)') as id_proposicao,
+            safe_cast(siglatipo as string) as tipo_proposicao,
+            safe_cast(numero as string) as numero,
+            safe_cast(tema as string) as tema,
+            safe_cast(relevancia as int64) as relevancia
+        from `basedosdados-staging.br_camara_dados_abertos_staging.proposicao_tema`
+    )
+select *
+from tables
+where not (ano = 2011 and id_proposicao = '510035')
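proposicao_microdados above derives data and horario by formatting dataapresentacao as ISO-8601 and splitting on "T". The same idea in Python, with an illustrative timestamp:

from datetime import datetime

dataapresentacao = datetime.fromisoformat("2023-08-14T15:04:00")
data, horario = dataapresentacao.isoformat().split("T")
print(data, horario)  # 2023-08-14 15:04:00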
diff --git a/models/br_ibge_censo_2022/br_ibge_censo_2022__populacao_residente_municipio.sql b/models/br_ibge_censo_2022/br_ibge_censo_2022__populacao_residente_municipio.sql
index f9d473a7..386f6a00 100644
--- a/models/br_ibge_censo_2022/br_ibge_censo_2022__populacao_residente_municipio.sql
+++ b/models/br_ibge_censo_2022/br_ibge_censo_2022__populacao_residente_municipio.sql
@@ -12,12 +12,77 @@ with
         safe_cast(forma_de_declaracao_da_idade as string) forma_declaracao_idade,
         safe_cast(sexo as string) sexo,
         safe_cast(idade as string) idade,
+        case
+            when idade = 'Menos de 1 mês'
+            then 0
+            when regexp_contains(idade, r'[0-9]+ mês')
+            then safe_cast(regexp_extract(idade, r'([0-9]+) mês') as int64) / 12
+            when regexp_contains(idade, r'[0-9]+ meses')
+            then safe_cast(regexp_extract(idade, r'([0-9]+) meses') as int64) / 12
+            when regexp_contains(idade, r'[0-9]+ anos')
+            then cast(regexp_extract(idade, r'([0-9]+) anos') as int64)
+            when regexp_contains(idade, r'[0-9]+ ano')
+            then cast(regexp_extract(idade, r'([0-9]+) ano') as int64)
+        end as idade_num,
         safe_cast(populacao_residente_pessoas_ as int64) populacao_residente,
     from
         `basedosdados-staging.br_ibge_censo_2022_staging.populacao_residente_municipio` t
     )
-select t2.cod as id_municipio, ibge.* except (municipio, nome_municipio, sigla_uf)
+select
+    t2.cod as id_municipio,
+    ibge.* except (municipio, nome_municipio, sigla_uf, idade_num, populacao_residente),
+    idade_num as idade_anos,
+    case
+        when idade_num between 0 and 4
+        then '0 a 4 anos'
+        when idade_num between 5 and 9
+        then '5 a 9 anos'
+        when idade_num between 10 and 14
+        then '10 a 14 anos'
+        when idade_num between 15 and 19
+        then '15 a 19 anos'
+        when idade_num between 20 and 24
+        then '20 a 24 anos'
+        when idade_num between 25 and 29
+        then '25 a 29 anos'
+        when idade_num between 30 and 34
+        then '30 a 34 anos'
+        when idade_num between 35 and 39
+        then '35 a 39 anos'
+        when idade_num between 40 and 44
+        then '40 a 44 anos'
+        when idade_num between 45 and 49
+        then '45 a 49 anos'
+        when idade_num between 50 and 54
+        then '50 a 54 anos'
+        when idade_num between 55 and 59
+        then '55 a 59 anos'
+        when idade_num between 60 and 64
+        then '60 a 64 anos'
+        when idade_num between 65 and 69
+        then '65 a 69 anos'
+        when idade_num between 70 and 74
+        then '70 a 74 anos'
+        when idade_num between 75 and 79
+        then '75 a 79 anos'
+        when idade_num between 80 and 84
+        then '80 a 84 anos'
+        when idade_num between 85 and 89
+        then '85 a 89 anos'
+        when idade_num between 90 and 94
+        then '90 a 94 anos'
+        when idade_num between 95 and 99
+        then '95 a 99 anos'
+        else '100 anos ou mais'
+    end as grupo_idade,
+    populacao_residente
 from ibge
 left join `basedosdados-dev.br_ibge_censo_2022_staging.auxiliary_table` t2
 on ibge.municipio = t2.municipio
+where
+    not (
+        idade like '% a %'
+        or idade like '100 anos ou mais'
+        or idade like 'Menos de 1 ano'
+    )
diff --git a/models/br_mme_consumo_energia_eletrica/br_mme_consumo_energia_eletrica__uf.sql b/models/br_mme_consumo_energia_eletrica/br_mme_consumo_energia_eletrica__uf.sql
new file mode 100644
index 00000000..620f7a43
--- /dev/null
+++ b/models/br_mme_consumo_energia_eletrica/br_mme_consumo_energia_eletrica__uf.sql
@@ -0,0 +1,19 @@
+{{
+    config(
+        alias="uf",
+        schema="br_mme_consumo_energia_eletrica",
+        materialized="table",
+    )
+}}
+select
+    safe_cast(ano as int64) as ano,
+    safe_cast(mes as int64) as mes,
+    safe_cast(sigla_uf as string) as sigla_uf,
+    safe_cast(tipo_consumo as string) as tipo_consumo,
+    case
+        when numero_consumidores = '0'
+        then null
+        else safe_cast(numero_consumidores as int64)
+    end as numero_consumidores,
+    safe_cast(consumo as int64) as consumo
+from `basedosdados-staging.br_mme_consumo_energia_eletrica_staging.uf` as t
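The long grupo_idade CASE ladder in the censo model above is plain five-year bucketing capped at "100 anos ou mais". An equivalent computed form, for reference only:

def grupo_idade(idade_anos: int) -> str:
    # Five-year buckets: 0-4, 5-9, ..., 95-99, then a single 100+ bucket.
    if idade_anos >= 100:
        return "100 anos ou mais"
    inicio = (idade_anos // 5) * 5
    return f"{inicio} a {inicio + 4} anos"

print(grupo_idade(0))    # 0 a 4 anos
print(grupo_idade(37))   # 35 a 39 anos
print(grupo_idade(102))  # 100 anos ou mais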
diff --git a/models/br_mme_consumo_energia_eletrica/code/energia_eletrica.ipynb b/models/br_mme_consumo_energia_eletrica/code/energia_eletrica.ipynb
new file mode 100755
index 00000000..0aff9a8c
--- /dev/null
+++ b/models/br_mme_consumo_energia_eletrica/code/energia_eletrica.ipynb
@@ -0,0 +1,372 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import warnings\n",
+    "warnings.filterwarnings('ignore')\n",
+    "import numpy as np"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def tratamento_consumo(sheet_name):\n",
+    "    meses = {\n",
+    "        \"JAN\": 1,\n",
+    "        \"FEV\": 2,\n",
+    "        \"MAR\": 3,\n",
+    "        \"ABR\": 4,\n",
+    "        \"MAI\": 5,\n",
+    "        \"JUN\": 6,\n",
+    "        \"JUL\": 7,\n",
+    "        \"AGO\": 8,\n",
+    "        \"SET\": 9,\n",
+    "        \"OUT\": 10,\n",
+    "        \"NOV\": 11,\n",
+    "        \"DEZ\": 12\n",
+    "    }\n",
+    "    ufs = {'Acre': 'AC', 'Alagoas': 'AL', 'Amazonas': 'AM', 'Amapá': 'AP', 'Bahia': 'BA', 'Ceará': 'CE', 'Distrito Federal': 'DF', 'Espírito Santo': 'ES', 'Goiás': 'GO',\n",
+    "           'Maranhão': 'MA', 'Minas Gerais': 'MG', 'Mato Grosso do Sul': 'MS', 'Mato Grosso': 'MT', 'Pará': 'PA', 'Paraíba': 'PB', 'Pernambuco': 'PE', 'Piauí': 'PI',\n",
+    "           'Paraná': 'PR', 'Rio de Janeiro': 'RJ', 'Rio Grande do Norte': 'RN', 'Rondônia': 'RO', 'Roraima': 'RR', 'Rio Grande do Sul': 'RS',\n",
+    "           'Santa Catarina': 'SC', 'Sergipe': 'SE', 'São Paulo': 'SP', 'Tocantins': 'TO'}\n",
+    "\n",
+    "    df = pd.read_excel('/mnt/x/dados/consumo_energia_eletrica/dados.xls', sheet_name, skiprows=4, skipfooter=1, usecols='A:IG')\n",
+    "    # Transpose the wide sheet and promote the first row to header\n",
+    "    df_transposta = df.T\n",
+    "    df_transposta.reset_index(inplace=True)\n",
+    "    df_transposta.drop(columns=['index', 2], inplace=True)\n",
+    "    df_transposta.columns = df_transposta.iloc[0]\n",
+    "    df_transposta = df_transposta[1:]\n",
+    "    df_transposta.columns = ['ano', 'mes', 'Rondônia', 'Acre', 'Amazonas', 'Roraima', 'Pará',\n",
+    "                             'Amapá', 'Tocantins', 'Maranhão', 'Piauí', 'Ceará',\n",
+    "                             'Rio Grande do Norte', 'Paraíba', 'Pernambuco', 'Alagoas', 'Sergipe',\n",
+    "                             'Bahia', 'Minas Gerais', 'Espírito Santo', 'Rio de Janeiro',\n",
+    "                             'São Paulo', 'Paraná', 'Santa Catarina', 'Rio Grande do Sul',\n",
+    "                             'Mato Grosso do Sul', 'Mato Grosso', 'Goiás', 'Distrito Federal']\n",
+    "    # Melt to long format; forward-fill the year, which the sheet records once per block\n",
+    "    df_transposta_melted = pd.melt(df_transposta, id_vars=['ano', 'mes'], var_name='Estado', value_name='Valor')\n",
+    "    df_transposta_melted['ano'] = df_transposta_melted['ano'].ffill()\n",
+    "    df_transposta_melted['mes'] = df_transposta_melted['mes'].map(meses)\n",
+    "\n",
+    "    df_transposta_melted.rename(columns={'Estado': 'sigla_uf', 'Valor': 'consumo'}, inplace=True)\n",
+    "    df_transposta_melted['sigla_uf'] = df_transposta_melted['sigla_uf'].map(ufs)\n",
+    "    return df_transposta_melted\n",
+    "\n",
+    "tipos_consumo = [\"Total\", \"Cativo\", \"Residencial\", \"Industrial\", \"Comercial\", \"Outros\"]\n",
+    "dfs_consumo = []\n",
+    "\n",
+    "# Process each consumption type (sheets 9-14 of the workbook)\n",
+    "for i, tipo in enumerate(tipos_consumo):\n",
+    "    df = tratamento_consumo(i + 9)\n",
+    "    df['tipo_consumo'] = tipo\n",
+    "    dfs_consumo.append(df)\n",
+    "\n",
+    "# Concatenate all the DataFrames\n",
+    "df_consumo = pd.concat(dfs_consumo)\n",
+    "df_consumo = df_consumo[['ano', 'mes', 'sigla_uf', 'tipo_consumo', 'consumo']]"
+   ]
+  },
"output_type": "execute_result" + } + ], + "source": [ + "df_consumo.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "def tratamento_consumidores(sheet_name):\n", + " meses = {\n", + " \"JAN\": 1,\n", + " \"FEV\": 2,\n", + " \"MAR\": 3,\n", + " \"ABR\": 4,\n", + " \"MAI\": 5,\n", + " \"JUN\": 6,\n", + " \"JUL\": 7,\n", + " \"AGO\": 8,\n", + " \"SET\": 9,\n", + " \"OUT\": 10,\n", + " \"NOV\": 11,\n", + " \"DEZ\": 12\n", + " }\n", + " ufs = {'Acre':'AC', 'Alagoas':'AL', 'Amazonas':'AM', 'Amapá':'AP', 'Bahia':'BA', 'Ceará':'CE', 'Distrito Federal':'DF', 'Espírito Santo':'ES', 'Goiás':'GO', \n", + " 'Maranhão':'MA', 'Minas Gerais':'MG', 'Mato Grosso do Sul':'MS', 'Mato Grosso':'MT', 'Pará':'PA', 'Paraíba':'PB', 'Pernambuco':'PE', 'Piauí':'PI', \n", + " 'Paraná':'PR', 'Rio de Janeiro':'RJ', 'Rio Grande do Norte':'RN', 'Rondônia':'RO', 'Roraima':'RR', 'Rio Grande do Sul':'RS', \n", + " 'Santa Catarina':'SC', 'Sergipe':'SE', 'São Paulo':'SP', 'Tocantins':'TO'}\n", + " \n", + " df = pd.read_excel('/mnt/x/dados/consumo_energia_eletrica/dados.xls', sheet_name, skiprows=4, skipfooter=1, usecols='A:IG')\n", + " df_transposta = df.T\n", + " df_transposta.reset_index(inplace=True)\n", + " df_transposta.drop(df_transposta[['index', 2]], inplace=True, axis=1)\n", + " df_transposta.columns = df_transposta.iloc[0]\n", + " df_transposta = df_transposta[1:]\n", + " df_transposta.columns = ['ano', 'mes', 'Rondônia', 'Acre', 'Amazonas', 'Roraima', 'Pará',\n", + " 'Amapá', 'Tocantins', 'Maranhão', 'Piauí', 'Ceará',\n", + " 'Rio Grande do Norte', 'Paraíba', 'Pernambuco', 'Alagoas', 'Sergipe',\n", + " 'Bahia', 'Minas Gerais', 'Espírito Santo', 'Rio de Janeiro',\n", + " 'São Paulo', 'Paraná', 'Santa Catarina', 'Rio Grande do Sul',\n", + " 'Mato Grosso do Sul', 'Mato Grosso', 'Goiás', 'Distrito Federal']\n", + " df_transposta_melted = pd.melt(df_transposta, id_vars=[\"ano\", 'mes'], var_name=\"Estado\", value_name=\"Valor\")\n", + " df_transposta_melted['ano'].ffill(inplace=True)\n", + " df_transposta_melted['mes'] = df_transposta_melted['mes'].map(meses)\n", + "\n", + " df_transposta_melted.rename(columns={'Estado' : 'sigla_uf', 'Valor':'numero_consumidores'}, inplace=True) \n", + " df_transposta_melted['sigla_uf'] = df_transposta_melted['sigla_uf'].map(ufs)\n", + " return df_transposta_melted\n", + "\n", + "tipos_consumidores = ['Residencial', 'Industrial', 'Comercial', 'Outros']\n", + "dfs_consumidores = []\n", + "\n", + "# Realizando o tratamento para cada tipo de consumidores\n", + "for i, tipo in enumerate(tipos_consumidores):\n", + " if i < 6:\n", + " df = tratamento_consumidores(i + 15)\n", + " df['tipo_consumo'] = tipo\n", + " dfs_consumidores.append(df)\n", + "\n", + "# Concatenando todos os DataFrames\n", + "df_consumidores = pd.concat(dfs_consumidores)\n", + "df_consumidores = df_consumidores[['ano', 'mes', 'sigla_uf','tipo_consumo', 'numero_consumidores']]" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "df_total = pd.merge(df_consumo, df_consumidores, how= 'left', on=['ano', 'mes', 'sigla_uf', 'tipo_consumo'])\n", + "df_total = df_total[['ano', 'mes', 'sigla_uf', 'tipo_consumo', 'numero_consumidores', 'consumo']]\n", + "df_total['consumo'] = df_total['consumo'].astype(int)\n", + "df_total['numero_consumidores'] = df_total['numero_consumidores'].fillna(0).astype(int)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + 
"data": { + "text/html": [ + "
\n", + " | ano | \n", + "mes | \n", + "sigla_uf | \n", + "tipo_consumo | \n", + "numero_consumidores | \n", + "consumo | \n", + "
---|---|---|---|---|---|---|
15840 | \n", + "2004 | \n", + "1 | \n", + "PE | \n", + "Residencial | \n", + "2001833 | \n", + "228466 | \n", + "
15841 | \n", + "2004 | \n", + "2 | \n", + "PE | \n", + "Residencial | \n", + "2004750 | \n", + "217764 | \n", + "
15842 | \n", + "2004 | \n", + "3 | \n", + "PE | \n", + "Residencial | \n", + "2018407 | \n", + "225066 | \n", + "
15843 | \n", + "2004 | \n", + "4 | \n", + "PE | \n", + "Residencial | \n", + "2026995 | \n", + "232170 | \n", + "
15844 | \n", + "2004 | \n", + "5 | \n", + "PE | \n", + "Residencial | \n", + "1972048 | \n", + "210365 | \n", + "
15845 | \n", + "2004 | \n", + "6 | \n", + "PE | \n", + "Residencial | \n", + "1963599 | \n", + "198030 | \n", + "
15846 | \n", + "2004 | \n", + "7 | \n", + "PE | \n", + "Residencial | \n", + "1954839 | \n", + "192699 | \n", + "
15847 | \n", + "2004 | \n", + "8 | \n", + "PE | \n", + "Residencial | \n", + "1949125 | \n", + "189991 | \n", + "
15848 | \n", + "2004 | \n", + "9 | \n", + "PE | \n", + "Residencial | \n", + "1989788 | \n", + "224208 | \n", + "
15849 | \n", + "2004 | \n", + "10 | \n", + "PE | \n", + "Residencial | \n", + "2022621 | \n", + "212547 | \n", + "
15850 | \n", + "2004 | \n", + "11 | \n", + "PE | \n", + "Residencial | \n", + "2046330 | \n", + "226619 | \n", + "
15851 | \n", + "2004 | \n", + "12 | \n", + "PE | \n", + "Residencial | \n", + "2040413 | \n", + "240852 | \n", + "
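Note the round-trip between this notebook and uf.sql: the notebook fills missing numero_consumidores with 0 before upload, and the model's CASE turns '0' back into NULL. A sketch of both sides in pandas, assuming that intent; the values are illustrative:

import pandas as pd

numero_consumidores = pd.Series([1500.0, None])
uploaded = numero_consumidores.fillna(0).astype(int)  # notebook side: 1500, 0
restored = uploaded.where(uploaded != 0)              # model side: 1500.0, NaN
print(restored.tolist())  # [1500.0, nan]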