Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[dbt] br_inep_enem #810

Merged
merged 2 commits into from
Nov 7, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions models/br_inep_enem/br_inep_enem__dicionario.sql
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
{{ config(alias="dicionario", schema="br_inep_enem") }}

select
safe_cast(id_tabela as string) id_tabela,
safe_cast(nome_coluna as string) nome_coluna,
Expand Down
2 changes: 1 addition & 1 deletion models/br_inep_enem/br_inep_enem__microdados.sql
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
partition_by={
"field": "ano",
"data_type": "int64",
"range": {"start": 1998, "end": 2022, "interval": 1},
"range": {"start": 1998, "end": 2023, "interval": 1},
},
labels={"project_id": "basedosdados", "tema": "educacao"},
)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
{{ config(alias="questionario_socioeconomico_2023", schema="br_inep_enem") }}

-- Exposes the staging table of the 2023 socioeconomic questionnaire.
-- The registration id and each of the 25 answers (q001..q025) are cast to
-- STRING; safe_cast yields NULL instead of aborting the query when a value
-- cannot be converted.
select
    safe_cast(id_inscricao as string) as id_inscricao,
    safe_cast(q001 as string) as q001,
    safe_cast(q002 as string) as q002,
    safe_cast(q003 as string) as q003,
    safe_cast(q004 as string) as q004,
    safe_cast(q005 as string) as q005,
    safe_cast(q006 as string) as q006,
    safe_cast(q007 as string) as q007,
    safe_cast(q008 as string) as q008,
    safe_cast(q009 as string) as q009,
    safe_cast(q010 as string) as q010,
    safe_cast(q011 as string) as q011,
    safe_cast(q012 as string) as q012,
    safe_cast(q013 as string) as q013,
    safe_cast(q014 as string) as q014,
    safe_cast(q015 as string) as q015,
    safe_cast(q016 as string) as q016,
    safe_cast(q017 as string) as q017,
    safe_cast(q018 as string) as q018,
    safe_cast(q019 as string) as q019,
    safe_cast(q020 as string) as q020,
    safe_cast(q021 as string) as q021,
    safe_cast(q022 as string) as q022,
    safe_cast(q023 as string) as q023,
    safe_cast(q024 as string) as q024,
    safe_cast(q025 as string) as q025
from `basedosdados-staging.br_inep_enem_staging.questionario_socioeconomico_2023` as t
308 changes: 308 additions & 0 deletions models/br_inep_enem/code/microdados.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,308 @@
import pandas as pd
import numpy as np
import os
from io import StringIO
import requests
import gc
import warnings
import re
from datetime import datetime
from os import getenv, walk
from os.path import join
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union
from uuid import uuid4
import zipfile
import basedosdados as bd

import numpy as np
import pandas as pd
warnings.filterwarnings("ignore")


def to_partitions(
    data: pd.DataFrame,
    partition_columns: List[str],
    savepath: str,
    file_type: str = "csv",
):
    """Save a dataframe into a Hive-style partitioned folder tree.

    One folder ``<column>=<value>/...`` is created under ``savepath`` for
    every distinct combination of values found in ``partition_columns``, and
    the rows belonging to that combination are written inside it (without
    the partition columns themselves).

    Args:
        data (pandas.DataFrame): Dataframe to be partitioned.
        partition_columns (list): Columns to be used as partitions, in the
            order the folders must be nested.
        savepath (str, pathlib.PosixPath): Folder path to save the partitions.
        file_type (str): ``"csv"`` (default) or ``"parquet"``. CSV files are
            appended to when they already exist; parquet files are
            overwritten.

    Raises:
        TypeError: If ``data`` is not a pandas DataFrame.
        ValueError: If ``file_type`` is neither ``"csv"`` nor ``"parquet"``.

    Example:
        data = {
            "ano": [2020, 2021, 2020, 2021, 2020, 2021, 2021, 2025],
            "mes": [1, 2, 3, 4, 5, 6, 6, 9],
            "sigla_uf": ["SP", "SP", "RJ", "RJ", "PR", "PR", "PR", "PR"],
            "dado": ["a", "b", "c", "d", "e", "f", "g", "h"],
        }
        to_partitions(
            data=pd.DataFrame(data),
            partition_columns=["ano", "mes", "sigla_uf"],
            savepath="partitions/",
        )
    """
    if not isinstance(data, pd.DataFrame):
        raise TypeError("Data need to be a pandas DataFrame")
    if file_type not in ("csv", "parquet"):
        # Fail loudly: otherwise the folders are created but no data is saved.
        raise ValueError(
            f"Unsupported file_type {file_type!r}; expected 'csv' or 'parquet'"
        )

    savepath = Path(savepath)

    # Group on the real key tuples. Testing each column with ``isin`` against
    # the set of values of a combination would also match rows whose values
    # are merely permuted between columns (e.g. a=2, b=1 for partition a=1/b=2).
    # sort=False keeps first-appearance order, dropna=False keeps rows with
    # missing keys, observed=True avoids empty groups for categorical keys.
    grouped = data.groupby(
        partition_columns, sort=False, dropna=False, observed=True
    )

    for keys, df_filter in grouped:
        # Depending on the pandas version a single grouping column yields a
        # scalar key instead of a 1-tuple.
        if not isinstance(keys, tuple):
            keys = (keys,)

        partition_values = [
            f"{partition}={value}"
            for partition, value in zip(partition_columns, keys)
        ]

        # The partition values live in the folder names, not in the file.
        df_filter = df_filter.drop(columns=partition_columns)

        # create folder tree
        filter_save_path = savepath.joinpath(*partition_values)
        filter_save_path.mkdir(parents=True, exist_ok=True)

        if file_type == "csv":
            # Append to the csv; the header is only written on creation.
            file_filter_save_path = filter_save_path / "data.csv"
            df_filter.to_csv(
                file_filter_save_path,
                sep=",",
                encoding="utf-8",
                na_rep="",
                index=False,
                mode="a",
                header=not file_filter_save_path.exists(),
            )
        else:
            # Parquet output replaces any existing file of the partition.
            file_filter_save_path = filter_save_path / "data.parquet"
            df_filter.to_parquet(
                file_filter_save_path, index=False, compression="gzip"
            )


# Default locations used when the script is run without arguments.
_DEFAULT_INPUT_PATH = "/home/tricktx/dados/br_inep_enem/Microdados ENEM 2023/DADOS/MICRODADOS_ENEM_2023.csv"
_DEFAULT_OUTPUT_PATH = "/home/tricktx/dados/br_inep_enem/main/"

# Maps the column names of the raw microdata file to the output names.
_RENAME = {
    "NU_INSCRICAO": "id_inscricao",
    "NU_ANO": "ano",
    "TP_FAIXA_ETARIA": "faixa_etaria",
    "TP_SEXO": "sexo",
    "TP_ESTADO_CIVIL": "estado_civil",
    "TP_COR_RACA": "cor_raca",
    "TP_NACIONALIDADE": "nacionalidade",
    "TP_ST_CONCLUSAO": "situacao_conclusao",
    "TP_ANO_CONCLUIU": "ano_conclusao",
    "TP_ESCOLA": "tipo_escola",
    "TP_ENSINO": "ensino",
    "IN_TREINEIRO": "indicador_treineiro",
    "CO_MUNICIPIO_ESC": "id_municipio_escola",
    "SG_UF_ESC": "sigla_uf_escola",
    "TP_DEPENDENCIA_ADM_ESC": "dependencia_administrativa_escola",
    "TP_LOCALIZACAO_ESC": "localizacao_escola",
    "TP_SIT_FUNC_ESC": "situacao_funcionamento_escola",
    "CO_MUNICIPIO_PROVA": "id_municipio_prova",
    "SG_UF_PROVA": "sigla_uf_prova",
    "TP_PRESENCA_CN": "presenca_ciencias_natureza",
    "TP_PRESENCA_CH": "presenca_ciencias_humanas",
    "TP_PRESENCA_LC": "presenca_linguagens_codigos",
    "TP_PRESENCA_MT": "presenca_matematica",
    "CO_PROVA_CN": "tipo_prova_ciencias_natureza",
    "CO_PROVA_CH": "tipo_prova_ciencias_humanas",
    "CO_PROVA_LC": "tipo_prova_linguagens_codigos",
    "CO_PROVA_MT": "tipo_prova_matematica",
    "NU_NOTA_CN": "nota_ciencias_natureza",
    "NU_NOTA_CH": "nota_ciencias_humanas",
    "NU_NOTA_LC": "nota_linguagens_codigos",
    "NU_NOTA_MT": "nota_matematica",
    "TX_RESPOSTAS_CN": "respostas_ciencias_natureza",
    "TX_RESPOSTAS_CH": "respostas_ciencias_humanas",
    "TX_RESPOSTAS_LC": "respostas_linguagens_codigos",
    "TX_RESPOSTAS_MT": "respostas_matematica",
    "TX_GABARITO_CN": "gabarito_ciencias_natureza",
    "TX_GABARITO_CH": "gabarito_ciencias_humanas",
    "TX_GABARITO_LC": "gabarito_linguagens_codigos",
    "TX_GABARITO_MT": "gabarito_matematica",
    "TP_LINGUA": "lingua_estrangeira",
    "TP_STATUS_REDACAO": "presenca_redacao",
    "NU_NOTA_COMP1": "nota_redacao_competencia_1",
    "NU_NOTA_COMP2": "nota_redacao_competencia_2",
    "NU_NOTA_COMP3": "nota_redacao_competencia_3",
    "NU_NOTA_COMP4": "nota_redacao_competencia_4",
    "NU_NOTA_COMP5": "nota_redacao_competencia_5",
    "NU_NOTA_REDACAO": "nota_redacao",
}

# Columns of the output, in output order. Columns that the raw file does not
# provide are still emitted, as empty values.
_OUTPUT_COLUMNS = [
    "ano",
    "id_inscricao",
    "faixa_etaria",
    "sexo",
    "id_municipio_residencia",
    "sigla_uf_residencia",
    "estado_civil",
    "cor_raca",
    "nacionalidade",
    "situacao_conclusao",
    "ano_conclusao",
    "tipo_escola",
    "ensino",
    "indicador_treineiro",
    "id_municipio_escola",
    "sigla_uf_escola",
    "dependencia_administrativa_escola",
    "localizacao_escola",
    "situacao_funcionamento_escola",
    "indicador_certificado",
    "nome_certificadora",
    "sigla_uf_certificadora",
    "id_municipio_prova",
    "sigla_uf_prova",
    "presenca_objetiva",
    "tipo_prova_objetiva",
    "nota_objetiva_competencia_1",
    "nota_objetiva_competencia_2",
    "nota_objetiva_competencia_3",
    "nota_objetiva_competencia_4",
    "nota_objetiva_competencia_5",
    "nota_objetiva",
    "respostas_objetiva",
    "gabarito_objetiva",
    "presenca_ciencias_natureza",
    "presenca_ciencias_humanas",
    "presenca_linguagens_codigos",
    "presenca_matematica",
    "tipo_prova_ciencias_natureza",
    "tipo_prova_ciencias_humanas",
    "tipo_prova_linguagens_codigos",
    "tipo_prova_matematica",
    "nota_ciencias_natureza",
    "nota_ciencias_humanas",
    "nota_linguagens_codigos",
    "nota_matematica",
    "respostas_ciencias_natureza",
    "respostas_ciencias_humanas",
    "respostas_linguagens_codigos",
    "respostas_matematica",
    "gabarito_ciencias_natureza",
    "gabarito_ciencias_humanas",
    "gabarito_linguagens_codigos",
    "gabarito_matematica",
    "lingua_estrangeira",
    "presenca_redacao",
    "nota_redacao_competencia_1",
    "nota_redacao_competencia_2",
    "nota_redacao_competencia_3",
    "nota_redacao_competencia_4",
    "nota_redacao_competencia_5",
    "nota_redacao",
    "indicador_questionario_socioeconomico",
]


def _clean_value(value) -> str:
    """Return ``value`` as text, with missing values and float suffixes removed.

    Only a text that is exactly ``"nan"`` becomes empty and only a trailing
    ``".0"`` is dropped, so a value that merely contains those characters
    somewhere in the middle is left untouched.
    """
    text = str(value)
    if text == "nan":
        return ""
    if text.endswith(".0"):
        return text[:-2]
    return text


def read_csv_enem(
    input_path: str = _DEFAULT_INPUT_PATH,
    output_path: str = _DEFAULT_OUTPUT_PATH,
    chunksize: int = 100000,
) -> None:
    """Convert the raw ENEM microdata csv into csv files partitioned by ``ano``.

    The input is read in chunks. Each chunk has its columns renamed, is
    restricted to the output columns (missing ones are added empty), has
    every value converted to clean text and is then appended to the
    partitioned output through ``to_partitions``.

    Args:
        input_path (str): Path of the ``;``-separated, latin1-encoded source
            file.
        output_path (str): Folder that receives the ``ano=<value>`` partitions.
        chunksize (int): Number of rows read per chunk.
    """
    chunks = pd.read_csv(
        input_path,
        sep=";",
        encoding="latin1",
        chunksize=chunksize,
    )

    for chunk_number, df in enumerate(chunks, start=1):
        # Progress indicator: number of the chunk being processed.
        print(chunk_number)

        # reindex both selects/orders the output columns and creates the
        # ones the source file lacks (as NaN, emptied by _clean_value below).
        df = df.rename(columns=_RENAME).reindex(columns=_OUTPUT_COLUMNS)

        for column in df.columns:
            df[column] = df[column].map(_clean_value)

        to_partitions(
            data=df,
            partition_columns=["ano"],
            savepath=output_path,
            file_type="csv",
        )


if __name__ == "__main__":
    read_csv_enem()
Loading
Loading