From d13de3417d159de34bf32b5ca9619d8d96cb9f49 Mon Sep 17 00:00:00 2001
From: Pedro Castro
Date: Mon, 16 Sep 2024 17:10:32 -0300
Subject: [PATCH] fix: `br_inep_saeb.aluno_ef_9ano` (#773)

---
Review note: the added script was revised after the original commit (wget
`-O`, whole-part subject matching, explicit column-count check); the blob id
on its `index` line was not recomputed.

 .../br_inep_saeb__aluno_ef_9ano.sql           |   1 -
 .../code/fix_microdados_aluno_ef_9ano_2021.py | 277 ++++++++++++++++++
 poetry.lock                                   |  43 +++-
 pyproject.toml                                |   1 +
 4 files changed, 320 insertions(+), 2 deletions(-)
 create mode 100644 models/br_inep_saeb/code/fix_microdados_aluno_ef_9ano_2021.py

diff --git a/models/br_inep_saeb/br_inep_saeb__aluno_ef_9ano.sql b/models/br_inep_saeb/br_inep_saeb__aluno_ef_9ano.sql
index fa9578c8..efc96599 100644
--- a/models/br_inep_saeb/br_inep_saeb__aluno_ef_9ano.sql
+++ b/models/br_inep_saeb/br_inep_saeb__aluno_ef_9ano.sql
@@ -13,7 +13,6 @@
     )
 }}
 
-
 select
     safe_cast(ano as int64) ano,
     safe_cast(sigla_uf as string) sigla_uf,
diff --git a/models/br_inep_saeb/code/fix_microdados_aluno_ef_9ano_2021.py b/models/br_inep_saeb/code/fix_microdados_aluno_ef_9ano_2021.py
new file mode 100644
index 00000000..be89e273
--- /dev/null
+++ b/models/br_inep_saeb/code/fix_microdados_aluno_ef_9ano_2021.py
@@ -0,0 +1,277 @@
+"""
+Rebuild the 2021 Roraima (RR) `MT` rows of `br_inep_saeb.aluno_ef_9ano`.
+
+Steps, in order:
+
+1. download the SAEB 2021 microdata zip from INEP and extract
+   `DADOS/TS_ALUNO_9EF.csv`;
+2. keep only the rows with `ID_UF == 14` (Roraima);
+3. reshape the per-subject columns (LP, MT, CH, CN) from wide to long, giving
+   one row per combination of index columns and subject;
+4. rename the columns to the names used by the BigQuery table, add the table
+   columns missing from the file as nulls and cast every column to string;
+5. write the MT rows to the `ano=2021/sigla_uf=RR` partition and upload it.
+"""
+
+import os
+import subprocess
+import zipfile
+
+import basedosdados as bd
+import polars as pl
+from databasers_utils import TableArchitecture
+
+ROOT = os.path.join("models", "br_inep_saeb")
+INPUT = os.path.join(ROOT, "input")
+TMP = os.path.join(ROOT, "tmp")
+OUTPUT = os.path.join(ROOT, "output", "aluno_ef_9ano")
+
+ZIP_URL = "https://download.inep.gov.br/microdados/microdados_saeb_2021_ensino_fundamental_e_medio.zip"
+
+ZIP_FILE = os.path.join(INPUT, os.path.basename(ZIP_URL))
+
+CSV_MEMBER = "DADOS/TS_ALUNO_9EF.csv"
+
+# Two-letter subject codes carried by the per-subject column names
+SUBJECTS = ("LP", "MT", "CH", "CN")
+
+os.makedirs(INPUT, exist_ok=True)
+
+# `-O` (capital) names the file the download is written to; lowercase `-o`
+# names wget's log file, so ZIP_FILE would hold the log rather than the zip.
+# The argument list avoids a shell and `check=True` aborts on a failed download.
+# NOTE(review): `--no-check-certificate` (kept from the original command) turns
+# off TLS verification -- confirm it is still needed for this host.
+subprocess.run(
+    ["wget", ZIP_URL, "--no-check-certificate", "-O", ZIP_FILE],
+    check=True,
+)
+
+with zipfile.ZipFile(ZIP_FILE) as z:
+    print(z.namelist())
+    z.extract(CSV_MEMBER, TMP)
+
+# Keep only Roraima (RR)
+df = pl.read_csv(os.path.join(TMP, CSV_MEMBER), separator=";").filter(
+    pl.col("ID_UF") == 14
+)
+
+csv_columns = df.columns
+
+arch = TableArchitecture(
+    "br_inep_saeb",
+    {
+        "aluno_ef_9ano": "https://docs.google.com/spreadsheets/d/1KLkvX8z9AKIe4iM5EeahVBCVUjikNvHJWfT-qTjM2IU/edit?gid=0#gid=0",
+    },
+)
+
+tables_arch = arch.tables()
+
+arch_ef_9ano = tables_arch["aluno_ef_9ano"]
+
+# Columns whose name ends with a subject code
+cols_disciplina = [i for i in csv_columns if i.endswith(SUBJECTS)]
+
+index_cols = [
+    "ID_SAEB",
+    "ID_REGIAO",
+    "ID_UF",
+    "ID_MUNICIPIO",
+    "ID_AREA",
+    "ID_ESCOLA",
+    "IN_PUBLICA",
+    "ID_LOCALIZACAO",
+    "ID_TURMA",
+    "ID_SERIE",
+    "ID_ALUNO",
+    "IN_SITUACAO_CENSO",
+    "IN_AMOSTRA",
+    "ESTRATO",
+    "IN_PREENCHIMENTO_QUESTIONARIO",
+    "IN_INSE",
+    "INSE_ALUNO",
+    "NU_TIPO_NIVEL_INSE",
+    "PESO_ALUNO_INSE",
+]
+
+# Columns unpivoted by `wide_to_long`
+on = [
+    "PROFICIENCIA_LP_SAEB",
+    "PROFICIENCIA_MT_SAEB",
+    "PROFICIENCIA_CH_SAEB",
+    "PROFICIENCIA_CN_SAEB",
+    "ERRO_PADRAO_LP_SAEB",
+    "ERRO_PADRAO_MT_SAEB",
+    "ERRO_PADRAO_CH_SAEB",
+    "ERRO_PADRAO_CN_SAEB",
+    *cols_disciplina,
+]
+
+
+def find_disc(value: str) -> str:
+    """
+    Returns the two characters identifying the subject of a variable
+
+    The last two characters are used when they are a subject code; otherwise
+    the second-to-last `_`-separated part of the name is returned.
+
+    Parameters
+    ----------
+    value
+        Variable name
+
+    Examples
+    --------
+    >>> find_disc("PROFICIENCIA_CH_SAEB")
+    'CH'
+    >>> find_disc("ERRO_PADRAO_MT")
+    'MT'
+    """
+    last_two_char = value[-2:]
+    if last_two_char in SUBJECTS:
+        return last_two_char
+    return value.split("_")[-2]
+
+
+def renames_variables(value: tuple[str, str]) -> str:
+    """
+    Removes the subject code from a variable name
+
+    Parameters
+    ----------
+    value
+        Tuple (two values), lhs is variable name and rhs is subject
+
+    Returns
+    -------
+    str
+        The name without the `_`-separated parts equal to the subject; the
+        name is returned unchanged when no part equals it
+
+    Examples
+    --------
+    >>> renames_variables(("PROFICIENCIA_CH_SAEB", "CH"))
+    'PROFICIENCIA_SAEB'
+    """
+    variable, disc = value
+    # Compare whole parts with `!=`; a substring test (`part not in disc`) would
+    # also drop any part that is a single letter of the subject code.
+    return "_".join(part for part in variable.split("_") if part != disc)
+
+
+# `TX_RESP*` columns that do not end with a subject code; they are kept as
+# index columns instead of being unpivoted
+other_index_cols = [
+    i
+    for i in csv_columns
+    if i.startswith("TX_RESP") and i.split("_")[-1] not in SUBJECTS
+]
+
+
+def wide_to_long(df: pl.DataFrame) -> pl.DataFrame:
+    """
+    Convert a DataFrame from wide to long format
+
+    The columns in `on` are unpivoted, tagged with their subject in a new
+    `disciplina` column, renamed without the subject code and pivoted back, so
+    the result has one row per combination of index columns and subject.
+    Reads the module-level `on`, `index_cols` and `other_index_cols`.
+    """
+    return (
+        df.unpivot(on=on, index=[*index_cols, *other_index_cols])
+        .with_columns(
+            pl.col("variable")
+            .map_elements(find_disc, return_dtype=pl.String)
+            .alias("disciplina"),
+        )
+        .with_columns(
+            # Replaces `variable` with the subject-free name; the alias only makes
+            # explicit the output name the expression already had
+            pl.struct(["variable", "disciplina"])
+            .map_elements(
+                lambda cols: renames_variables((cols["variable"], cols["disciplina"])),
+                return_dtype=pl.String,
+            )
+            .alias("variable")
+        )
+        .pivot(
+            on="variable",
+            values="value",
+            index=[*index_cols, *other_index_cols, "disciplina"],
+        )
+    )
+
+
+manual_renames = {
+    "PROFICIENCIA_SAEB": "proficiencia_saeb",
+    "ERRO_PADRAO_SAEB": "erro_padrao_saeb",
+    "IN_PREENCHIMENTO": "preenchimento_caderno",
+    "IN_PRESENCA": "presenca",
+    "ID_CADERNO": "caderno",
+    "ID_BLOCO_1": "bloco_1",
+    "ID_BLOCO_2": "bloco_2",
+    "ID_BLOCO_3": "bloco_3",
+    "NU_BLOCO_1_ABERTA": "bloco_1_aberto",
+    "NU_BLOCO_2_ABERTA": "bloco_2_aberto",
+    "TX_RESP_BLOCO1": "respostas_bloco_1",
+    "TX_RESP_BLOCO2": "respostas_bloco_2",
+    "TX_RESP_BLOCO3": "respostas_bloco_3",
+    "CO_CONCEITO_Q1": "conceito_q1",
+    "CO_CONCEITO_Q2": "conceito_q2",
+    "IN_PROFICIENCIA": "indicador_proficiencia",
+    "PESO_ALUNO": "peso_aluno",
+    "PROFICIENCIA": "proficiencia",
+    "ERRO_PADRAO": "erro_padrao",
+}
+
+tb_aluno_ef_9ano = bd.Table("br_inep_saeb", table_id="aluno_ef_9ano")
+
+bq_cols = tb_aluno_ef_9ano._get_columns_from_bq()
+
+# Column name -> type, for every column reported for the BigQuery table
+cols_dict = {i["name"]: i["type"] for i in bq_cols["columns"]}
+
+# Architecture name for each 2021 original name that is present in the CSV
+common_renames = {
+    i["original_name_2021"]: i["name"]
+    for i in arch_ef_9ano.loc[arch_ef_9ano["original_name_2021"] != ""][
+        ["name", "original_name_2021"]
+    ].to_dict("records")
+    if i["original_name_2021"] in csv_columns
+}
+
+df = (
+    wide_to_long(df)
+    .rename({**common_renames, **manual_renames})
+    .with_columns(pl.lit("RR").alias("sigla_uf"))
+    .drop(["ID_UF"])
+)
+
+# Table columns absent from the reshaped data; they are added as nulls below
+existing_cols = set(df.columns)
+empty_cols_to_add = [i for i in cols_dict if i not in existing_cols]
+
+df = (
+    df.with_columns([pl.lit(None).alias(col) for col in empty_cols_to_add])
+    .with_columns([pl.col(col_name).cast(pl.String) for col_name in cols_dict])
+    .select(*cols_dict)
+    .filter(pl.col("disciplina") == "MT")
+)
+
+# Explicit check instead of `assert`, which is skipped under `python -O`
+if len(df.columns) != len(bq_cols["columns"]):
+    raise ValueError(
+        f"expected {len(bq_cols['columns'])} columns, got {len(df.columns)}"
+    )
+
+PARTITION_DIR = os.path.join(OUTPUT, "ano=2021", "sigla_uf=RR")
+
+os.makedirs(PARTITION_DIR, exist_ok=True)
+
+df.write_csv(os.path.join(PARTITION_DIR, "microdados_mt.csv"))
+
+tb_aluno_ef_9ano.create(
+    OUTPUT,
+    if_table_exists="replace",
+    if_storage_data_exists="replace",
+)
diff --git a/poetry.lock b/poetry.lock
index add9d438..9d41df00 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1863,6 +1863,47 @@ files = [
 docs = ["furo (>=2023.7.26)", "proselint (>=0.13)", "sphinx (>=7.1.1)", "sphinx-autodoc-typehints (>=1.24)"]
 test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=7.4)", "pytest-cov (>=4.1)", "pytest-mock (>=3.11.1)"]
 
+[[package]]
+name = "polars"
+version = "1.7.0"
+description = "Blazingly fast DataFrame library"
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "polars-1.7.0-cp38-abi3-macosx_10_12_x86_64.whl", hash = "sha256:515201ae296a3ef770102dab196f207581b617c7a053cf200aff57fdca03001f"},
+    {file = "polars-1.7.0-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:5c7b03a60b3490f7ca17e0a69208b29bea1a70f62b394443035f196c751ca126"},
+    {file = "polars-1.7.0-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:56042ca1e35c2e6a08e2f37d981a46af685e273cde08944328921a22ed5e1926"},
+    {file = "polars-1.7.0-cp38-abi3-manylinux_2_24_aarch64.whl", hash = "sha256:bdb11cc9807d3886a8ea516a9d829fb9d8570d11b80be9310f825d1331011dbd"},
+    {file = "polars-1.7.0-cp38-abi3-win_amd64.whl", hash = "sha256:3d0fe4b8505c918114bbef6c8f8bd7a245ce3b7cd59f3c9fba2c9cd1f24606f9"},
+    {file = "polars-1.7.0.tar.gz", hash = "sha256:cd90892c8ed0ee755fdf0a0ebc08489c6f0094eaa7eed5edb48956c3e431ef8c"},
+]
+
+[package.extras]
+adbc = ["adbc-driver-manager[dbapi]", "adbc-driver-sqlite[dbapi]"]
+all = ["polars[async,cloudpickle,database,deltalake,excel,fsspec,graph,iceberg,numpy,pandas,plot,pyarrow,pydantic,style,timezone]"]
+async = ["gevent"]
+calamine = ["fastexcel (>=0.9)"]
+cloudpickle = ["cloudpickle"]
+connectorx = ["connectorx (>=0.3.2)"]
+database = ["nest-asyncio", "polars[adbc,connectorx,sqlalchemy]"]
+deltalake = ["deltalake (>=0.15.0)"]
+excel = ["polars[calamine,openpyxl,xlsx2csv,xlsxwriter]"]
+fsspec = ["fsspec"]
+gpu = ["cudf-polars-cu12"]
+graph = ["matplotlib"]
+iceberg = ["pyiceberg (>=0.5.0)"]
+numpy = ["numpy (>=1.16.0)"]
+openpyxl = ["openpyxl (>=3.0.0)"]
+pandas = ["pandas", "polars[pyarrow]"]
+plot = ["altair (>=5.4.0)"]
+pyarrow = ["pyarrow (>=7.0.0)"]
+pydantic = ["pydantic"]
+sqlalchemy = ["polars[pandas]", "sqlalchemy"]
+style = ["great-tables (>=0.8.0)"]
+timezone = ["backports-zoneinfo", "tzdata"]
+xlsx2csv = ["xlsx2csv (>=0.8.0)"]
+xlsxwriter = ["xlsxwriter"]
+
 [[package]]
 name = "pre-commit"
 version = "3.8.0"
@@ -2872,4 +2913,4 @@ type = ["pytest-mypy"]
 [metadata]
 lock-version = "2.0"
 python-versions = "~3.9"
-content-hash = "449d554196de70d2bc05c8e9a343e3cfd2ccbadaf913590818f0ed8d27e8fef3"
+content-hash = "98e3b8ca5ac1d40be1d09241e907e7e1bc99e54cf66e550091591fad6dabbdb3"
diff --git a/pyproject.toml b/pyproject.toml
index 03e3ca5f..cc7e8ec0 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -29,3 +29,4 @@ markdown = "^3.3.6"
 basedosdados = {version = "2.0.0b16", extras = ["upload"]}
 gql = "^3.5.0"
 databasers-utils = {git = "https://github.com/basedosdados/databasers-utils.git#main"}
+polars = "^1.7.0"