From bbcc0d0a85ea68388e0ae51482f1adb7c52630ed Mon Sep 17 00:00:00 2001 From: Ricardo Dahis Date: Thu, 21 Sep 2023 11:26:26 -0400 Subject: [PATCH] upload MiDES procurement code --- bases/world_wb_mides/code/licitacao.ipynb | 1271 ++++++++++++++++ .../world_wb_mides/code/licitacao_item.ipynb | 1274 +++++++++++++++++ .../code/licitacao_participante.ipynb | 1012 +++++++++++++ 3 files changed, 3557 insertions(+) create mode 100644 bases/world_wb_mides/code/licitacao.ipynb create mode 100644 bases/world_wb_mides/code/licitacao_item.ipynb create mode 100644 bases/world_wb_mides/code/licitacao_participante.ipynb diff --git a/bases/world_wb_mides/code/licitacao.ipynb b/bases/world_wb_mides/code/licitacao.ipynb new file mode 100644 index 000000000..77dbf4183 --- /dev/null +++ b/bases/world_wb_mides/code/licitacao.ipynb @@ -0,0 +1,1271 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "source": [ + "**Preamble**\n", + "* This code cleans the raw public procurement data obtained from the State Audit Courts (TCEs) of the following states: CE, PE, MG, PR, RS and PB.\n", + "* The final output of this code is the tender table (*licitacao*), available at [basedosdados](https://basedosdados.org/dataset/d3874769-bcbd-4ece-a38a-157ba1021514?table=14c5d05b-9830-4710-b7ac-7e0ca1bf9d8b).\n", + "* Made by: Nathalia Sales\n", + "\n", + "\n", + "\n" + ], + "metadata": { + "id": "zWLMuEwH2UKg" + } + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "zQzslM-4eW1K" + }, + "outputs": [], + "source": [ + "# Connect to google drive\n", + "\n", + "from google.colab import drive\n", + "drive.mount('/content/gdrive')\n", + "\n", + "# Necessary packages\n", + "\n", + "import pandas as pd\n", + "import numpy as np\n", + "import os\n", + "import glob\n", + "from zipfile import ZipFile\n", + "from datetime import datetime\n", + "\n", + "# Display options\n", + "\n", + "pd.set_option('display.max_columns', None)\n", + "pd.options.display.float_format = '{:.2f}'.format\n", 
+ "\n", + "# Set directory\n", + "\n", + "path = '/content/gdrive/MyDrive/ComprasPublicas_Brasil'\n", + "\n", + "# Open some auxiliary files\n", + "\n", + "municipio = pd.read_csv(os.path.join(path, \"auxiliary_files/municipio.csv\"), encoding='utf-8', dtype=str)\n", + "\n", + "id_tce = pd.read_csv(os.path.join(path, \"input/PE/municipios.csv\"), encoding='latin-1',dtype=str,\n", + " usecols = ['CODIGOIBGE','CODIGO','UNIDADEFEDERATIVA'])\n", + "\n", + "id_tce.rename(columns={'CODIGOIBGE':'id_municipio', 'CODIGO':'id_municipio_tce', 'UNIDADEFEDERATIVA':'sigla_uf'}, inplace=True)\n", + "\n", + "# Merge both\n", + "municipio = pd.merge(municipio, id_tce, how='left', left_on=['id_municipio', 'sigla_uf'], right_on=['id_municipio', 'sigla_uf'])\n", + "\n", + "ug_id = pd.read_csv(os.path.join(path, \"auxiliary_files/ug_id_mg.csv\"), sep=',', dtype=str) # MG\n", + "\n", + "orgao_municipio = pd.read_csv(os.path.join(path, \"input/RS/orgaos_auditados_rs.csv\"), encoding='utf-8',dtype=str,\n", + " usecols=['CD_MUNICIPIO_IBGE', 'CD_ORGAO']) # RS\n", + "\n", + "# Create a list of UFs\n", + "ufs = municipio['sigla_uf'].unique().tolist()\n", + "\n", + "# Set columns order\n", + "\n", + "ordem = ['ano','mes','sigla_uf','id_municipio','orgao','id_unidade_gestora','id_licitacao_bd','id_licitacao','id_dispensa','ano_processo','data_abertura','data_edital',\n", + " 'data_homologacao','data_publicacao_dispensa','descricao_objeto','natureza_objeto', 'modalidade','natureza_processo','tipo','forma_pagamento',\n", + " 'valor_orcamento','valor','valor_corrigido','situacao','estagio','preferencia_micro_pequena','exclusiva_micro_pequena','contratacao','quantidade_convidados',\n", + " 'tipo_cadastro','carona','covid_19']" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "1XK1HKfylqDd" + }, + "source": [ + "## CE" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "KGgAmh-Plrd2" + }, + "outputs": [], + "source": [ + "#CE\n", + "\n", + "# Get a 
list of all CSV files\n", + "\n", + "all_files = glob.glob(os.path.join(path,\"input/CE/Licitações/licitacoes_*.csv\"))\n", + "\n", + "# Initialize an empty list and loop through each CSV file\n", + "\n", + "all_df_ce = []\n", + "for f in all_files:\n", + " df1 = pd.read_csv(f, sep=';', dtype=str, encoding='latin-1')\n", + " df1['arquivo'] = f.split('/')[-1]\n", + " all_df_ce.append(df1)\n", + "\n", + "# Concatenate all DataFrames in the list into a single DataFrame\n", + "\n", + "ce = pd.concat(all_df_ce, ignore_index=True, sort=True)\n", + "\n", + "# Extract the year from the file name\n", + "\n", + "ce['ano'] = ce['arquivo'].str[11:15]\n", + "\n", + "# List original variables to drop\n", + "\n", + "ce_drop = ['cpf_gestor','cpf_responsavel_homologacao','cpf_responsavel_juridico',\n", + " 'data_criacao_comissao','descricao1_justificativa_preco',\n", + " 'descricao1_motivo_fornecedor','descricao2_justificativa_preco',\n", + " 'descricao2_motivo_fornecedor','hora_licitacao','arquivo',\n", + " 'nome_responsavel_homologacao','nome_responsavel_juridico','numero_comissao',\n", + " 'valor_limite_superior','descricao2_objeto_licitacao','nome_orgao_ata ',\n", + " 'data_realizacao_autuacao_licitacao']\n", + "\n", + "# Dictionary\n", + "\n", + "ce_rename = {'numero_licitacao':'id_licitacao','descricao1_objeto_licitacao':'descricao_objeto',\n", + " 'modalidade_licitacao':'modalidade','tipo_licitacao':'tipo',\n", + " 'valor_orcado_estimado':'valor_orcamento',\n", + " 'data_realizacao_licitacao':'data_abertura',\n", + " 'data_emissao_edital':'data_edital'}\n", + "\n", + "modalidade = {'4':'11','5':'7','6':'4','7':'12'}\n", + "\n", + "tipo = {'6':'4','7':'5','8':'2','9':'13'}\n", + "\n", + "# Drop and rename\n", + "\n", + "ce.drop(ce_drop, axis=1, inplace=True)\n", + "ce.rename(ce_rename, axis=1, inplace=True)\n", + "\n", + "# Read a CSV file containing municipality information\n", + "\n", + "id_mun = pd.read_csv(os.path.join(path, \"municipios.csv\"), sep=';', dtype=str, 
encoding='latin-1',\n", + " usecols=['geoibgeId','codigo_municipio'])\n", + "\n", + "id_mun.rename({'geoibgeId':'id_municipio'}, axis=1, inplace=True)\n", + "\n", + "# Merge on codigo_municipio to get id_municipio (IBGE code)\n", + "\n", + "ce = pd.merge(ce, id_mun, how='left', left_on='codigo_municipio', right_on='codigo_municipio')\n", + "\n", + "# Replace by dictionary\n", + "\n", + "ce['modalidade'] = ce['modalidade'].replace(modalidade, regex=True)\n", + "ce['tipo'] = ce['tipo'].replace(tipo, regex=True)\n", + "\n", + "# Adjustments in type of purchase: dispensa and inexigibilidade\n", + "# Modality is coded as not applicable (9), but another variable tells us it is a tender waiver or a non-requirement\n", + "# 9 - modalidade não se aplica, D - dispensa, I - inexigibilidade\n", + "\n", + "ce['modalidade'] = np.where((ce['modalidade'] == \"9\") & (ce['modalidade_processo_administrativo'] == \"D\"), '8', ce['modalidade'])\n", + "ce['modalidade'] = np.where((ce['modalidade'] == \"9\") & (ce['modalidade_processo_administrativo'] == \"I\"), '10', ce['modalidade'])\n", + "\n", + "# Type does not apply and we don't know what it is - set as missing\n", + "# N - normal, R - registro de preços\n", + "\n", + "ce['modalidade'] = np.where((ce['modalidade'] == \"9\") & (ce['modalidade_processo_administrativo'] == \"N\"), np.nan, ce['modalidade'])\n", + "ce['modalidade'] = np.where((ce['modalidade'] == \"9\") & (ce['modalidade_processo_administrativo'] == \"R\"), np.nan, ce['modalidade'])\n", + "\n", + "# P - Regras de Organismos Internacionais\n", + "ce['modalidade'] = np.where(ce['modalidade_processo_administrativo'] == \"P\", '29', ce['modalidade'])\n", + "\n", + "# Piggyback procurement\n", + "\n", + "ce['carona'] = np.where(ce['modalidade_processo_administrativo'] == \"R\", 1, 0)\n", + "\n", + "# Format date columns\n", + "\n", + "ce['data_abertura'] = ce['data_abertura'].str[:10]\n", + "ce['data_edital'] = ce['data_edital'].str[:10]\n", + "ce['data_homologacao'] = 
ce['data_homologacao'].str[:10]\n", + "\n", + "# Create valor_corrigido\n", + "# If both 'valor' and 'valor_orcamento' are not null, choose the smaller of the two values.\n", + "# If 'valor' is not null but 'valor_orcamento' is null, use 'valor'.\n", + "# If 'valor' is null but 'valor_orcamento' is not null, use 'valor_orcamento'\n", + "# In this case, we only have 'valor_orcamento', so they will be the same\n", + "\n", + "ce['valor_orcamento'] = ce['valor_orcamento'].astype(float)\n", + "ce['valor'] = np.nan\n", + "\n", + "ce['valor_corrigido'] = ce.apply(lambda x: min(x['valor'], x['valor_orcamento'])\n", + " if pd.notnull(x['valor']) and pd.notnull(x['valor_orcamento'])\n", + " else x['valor'] if pd.notnull(x['valor'])\n", + " else x['valor_orcamento'], axis=1)\n", + "\n", + "# Assign state acronym to the 'sigla_uf'\n", + "ce['sigla_uf'] = 'CE'\n", + "\n", + "# Create a unique identifier for each purchase\n", + "\n", + "ce['id_licitacao_bd'] = ce['id_licitacao'] + ce['id_municipio'] + ce['ano'].str[2:4] + ce['sigla_uf']\n", + "\n", + "# String correction\n", + "\n", + "ce['descricao_objeto'] = ce['descricao_objeto'].str.replace('ADMINSITRATIVAS','ADMINISTRATIVAS')\n", + "\n", + "# Drop duplicates in all variables\n", + "\n", + "ce.drop_duplicates(inplace=True)\n", + "\n", + "# Replace duplicated values in the 'id_licitacao_bd' column with missing\n", + "\n", + "ce.loc[ce.duplicated(['id_licitacao_bd'], keep=False), 'id_licitacao_bd'] = np.nan\n", + "\n", + "# print(ce.duplicated(subset=['id_licitacao_bd']).value_counts(normalize=True)*100)\n", + "# 0.87% missing\n", + "\n", + "# Reorder columns\n", + "ce = ce.reindex(columns=ordem)\n", + "\n", + "# Save\n", + "ce.to_csv(os.path.join(path,\"output/licitacao_ce.csv\"), index=False, na_rep='', float_format='%.2f')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "-jJ_uoQ5rYBy" + }, + "source": [ + "## PE" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": 
"9S91sEwLrVPx" + }, + "outputs": [], + "source": [ + "#PE\n", + "\n", + "# Get a list of all CSV files\n", + "\n", + "all_files = glob.glob(os.path.join(path,\"input/PE/Licitações/licitacoesdetalhes_*.csv\"))\n", + "\n", + "# Initialize an empty list and loop through each CSV file\n", + "\n", + "all_df = []\n", + "for f in all_files:\n", + " df1 = pd.read_csv(f, sep=',', encoding='latin-1',dtype=str)\n", + " df1['arquivo'] = f.split('/')[-1]\n", + " all_df.append(df1)\n", + "\n", + "# Concatenate all DataFrames in the list into a single DataFrame\n", + "\n", + "pe = pd.concat(all_df, ignore_index=True, sort=True)\n", + "\n", + "# Extract the year from the file name\n", + "\n", + "pe['ano'] = pe['arquivo'].str[19:23]\n", + "\n", + "# List original variables to drop\n", + "\n", + "pe_drop = ['ADJUDICADA', 'ANOMODALIDADE','CODIGODESCRICAOOBJETO','CODIGOESTAGIOLICITACAO',\n", + " 'CODIGONATUREZA','CODIGOOBJETO','CODIGOSITUACAOLICITACAO', 'OBJETOCONFORMEEDITAL',\n", + " 'RAZAOSOCIAL','RESULTADOHABILITACAO','UG', 'ESPECIFICACAOOBJETO','NUMERODOCUMENTOAJUSTADO',\n", + " 'NUMEROMODALIDADE','NUMEROPROCESSO','TOTALADJUDICADOLICITANTE','LinkArquivo',\n", + " 'QTDELICITANTES', 'FUNDAMENTOLEGAL','DOTACAOORCAMENTARIA',\n", + " 'DATAPUBLICACAOHABILITACAO','arquivo']\n", + "\n", + "# Dictionary\n", + "\n", + "pe_rename = {'CODIGOUG':'id_unidade_gestora', 'CODIGOPL':'id_licitacao',\n", + " 'CARACTERISTICAOBJETO':'contratacao','ANOPROCESSO':'ano_processo',\n", + " 'SITUACAOLICITACAO':'situacao', 'TOTALADJUDICADOLICITACAO':'valor',\n", + " 'DESCRICAOOBJETO':'descricao_objeto','VALORORCAMENTOESTIMATIVO':'valor_orcamento',\n", + " 'DATAEMISSAOEDITAL':'data_edital','DATASESSAOABERTURA':'data_abertura',\n", + " 'DATAPUBLICACAOHOMOLOGACAO':'data_homologacao','ESTAGIOLICITACAO':'estagio',\n", + " 'NOMENATUREZA':'natureza_objeto','NOMEMODALIDADE':'modalidade'}\n", + "\n", + "natureza_objeto = {'Serviços de Engenharia':'1','Obras':'1','Concessão':'4','Permissão':'5',\n", + " 'Alienação de 
Bens':'6','Compras':'8','Outros Serviços':'9','Locação de Bens':'10'}\n", + "\n", + "situacao = {'Concluído':'1','Em Andamento':'2','Processo Revogado':'3','Processo Deserto':'4',\n", + " 'Processo Fracassado':'5','Processo Anulado':'6','Processo Sustado por Determinação do TCE':'7',\n", + " 'Processo Suspenso por Determinação Judicial':'8','Processo Suspenso por Decisão Administrativa':'9',\n", + " 'Edital Impugnado Aguardando Retificação':'10'}\n", + "\n", + "estagio = {'Processo Instaurado':'1','Edital Publicado':'2',\n", + " 'Licitantes Cadastrados':'3','Habilitação Concluída':'5',\n", + " 'Julgamento Concluído':'6','Processo Adjudicado / Homologado / Ratificado':'7'}\n", + "\n", + "contratacao = {'Integral sem Itens com Percentual Proposto':'5',\n", + " 'Por Lotes com Percentual Proposto':'6','Por Itens com Percentual Proposto':'7',\n", + " 'Integral com Itens com Percentual Proposto':'8',\n", + " 'Por Itens':'1','Por Lotes':'2','Integral sem Itens':'3',\n", + " 'Integral com Itens':'4'}\n", + "\n", + "# Drop and rename\n", + "\n", + "pe.drop(pe_drop, axis=1, inplace=True)\n", + "pe.rename(pe_rename, axis=1, inplace=True)\n", + "\n", + "# Read a CSV file containing municipality information\n", + "\n", + "ug = pd.read_csv(os.path.join(path, \"input/PE/unidadesjurisdicionadas.csv\"), sep=',', encoding='latin-1',dtype=str)\n", + "\n", + "ug_drop = ['CODIGOTCE', 'ESFERA', 'PODER','UNIDADEFEDERATIVA', 'NATUREZA', 'TIPOPESSOAJURIDICA', 'ORGAO', 'MUNICIPIO', 'SIGLA', 'SITUACAO', 'CNPJ']\n", + "ug_rename = {'CODIGOMUNICIPIO':'id_municipio_tce','ID_UNIDADE_GESTORA': 'id_unidade_gestora'}\n", + "\n", + "ug.drop(ug_drop, axis=1, inplace=True)\n", + "ug.rename(columns = ug_rename, inplace=True)\n", + "\n", + "# Merge on id_unidade_gestora to get id_municipio_tce (TCE code)\n", + "\n", + "pe = pd.merge(pe, ug, how='left',left_on=['id_unidade_gestora'], right_on=['id_unidade_gestora'], indicator=True)\n", + "\n", + "pe.drop('_merge',axis=1, inplace=True)\n", + "\n", + "# 
Merge on id_municipio_tce to get id_municipio (IBGE code)\n", + "\n", + "pe = pd.merge(pe, municipio, how='left', left_on='id_municipio_tce', right_on='id_municipio_tce')\n", + "\n", + "pe.drop(['nome','id_municipio_6','id_municipio_tce'],axis=1, inplace=True)\n", + "\n", + "# Drop duplicates: suppliers level to bid level\n", + "\n", + "pe.drop_duplicates(subset=['id_licitacao', 'id_municipio','id_unidade_gestora', 'ano'], inplace=True)\n", + "\n", + "# Create a unique identifier for each purchase\n", + "\n", + "pe['id_licitacao_bd'] = pe['id_licitacao'] + pe['id_unidade_gestora'] + pe['sigla_uf']\n", + "\n", + "# Format date columns\n", + "\n", + "pe['data_edital'] = pe['data_edital'].str[:10]\n", + "pe['data_abertura'] = pe['data_abertura'].str[:10]\n", + "pe['data_homologacao'] = pe['data_homologacao'].str[:10]\n", + "\n", + "# Create valor_corrigido\n", + "# If both 'valor' and 'valor_orcamento' are not null, choose the smaller of the two values.\n", + "# If 'valor' is not null but 'valor_orcamento' is null, use 'valor'.\n", + "# If 'valor' is null but 'valor_orcamento' is not null, use 'valor_orcamento'\n", + "\n", + "pe['valor'] = pe['valor'].astype(float)\n", + "pe['valor_orcamento'] = pe['valor_orcamento'].astype(float)\n", + "\n", + "pe['valor_corrigido'] = pe.apply(lambda x: min(x['valor'], x['valor_orcamento'])\n", + " if pd.notnull(x['valor']) and pd.notnull(x['valor_orcamento'])\n", + " else x['valor'] if pd.notnull(x['valor'])\n", + " else x['valor_orcamento'], axis=1)\n", + "\n", + "# Replace by dictionary\n", + "\n", + "pe['natureza_objeto'] = pe['natureza_objeto'].replace(natureza_objeto, regex=True)\n", + "pe['situacao'] = pe['situacao'].replace(situacao, regex=True)\n", + "pe['estagio'] = pe['estagio'].replace(estagio, regex=True)\n", + "pe['contratacao'] = pe['contratacao'].replace(contratacao, regex=True)\n", + "\n", + "pe['modalidade'] = pe['modalidade'].replace(['Convite','Tomada de Preços','Concorrência','Pregão Presencial',\n", + " 'Lei 
13.303/2016 - Pregão Presencial','Pregão Eletrônico',\n", + " 'Lei 13.303/2016 - Pregão Eletrônico','Leilão',\n", + " 'Dispensa','Lei 13.303/2016 - Dispensa','Inexigibilidade',\n", + " 'Lei 13.303/2016 - Inexigibilidade','Regime Diferenciado de Contratações (RDC)',\n", + " 'Procedimento de Licitação Próprio'],\n", + " ['1','2','3','5','5','6','6','7','8','8','10','10','12',''])\n", + "\n", + "pe['modalidade'] = pe['modalidade'].replace(['BIRD - Contratação Direta (CD)',\n", + " 'BIRD - Comparação de Preços (CP)',\n", + " 'BIRD - Seleção Baseada na Qualidade e Custo (SBQC)',\n", + " 'BIRD - Licitação Pública Internacional (ICB)',\n", + " 'BIRD - Consultor Individual (CI)',\n", + " 'BIRD - Licitação Pública Nacional (NCB)',\n", + " 'BIRD - Seleção Pelo Menor Custo (SMC)',\n", + " 'BIRD - Seleção Baseada nas Qualificações do Consultor (SQC)',\n", + " 'BIRD - Banco Internacional para Reconstrução e Desenvolvimento'],['13','14','15','16','17','18','19','20',''])\n", + "\n", + "pe['modalidade'] = pe['modalidade'].replace(['BID - Contratação Direta (CD)',\n", + " 'BID - Comparação de Preços (CP)',\n", + " 'BID - Seleção Baseada na Qualidade e Custo (SBQC)',\n", + " 'BID - Consultor Individual (CI)',\n", + " 'BID - Licitação Pública Nacional (LPN)',\n", + " 'BID - Licitação Pública Internacional (LPI)',\n", + " 'BID - Seleção Baseada no Menor Custo (SBMC)',\n", + " 'BID - Seleção Baseada nas Qualificações do Consultor (SQC)'], ['21','22','23','24','25','26','27','28'])\n", + "\n", + "# Reorder columns\n", + "pe = pe.reindex(columns=ordem)\n", + "\n", + "# Save\n", + "pe.to_csv(os.path.join(path, \"output/licitacao_pe.csv\"), index=False, na_rep='', float_format='%.2f')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "hfvWowZ4mLlw" + }, + "source": [ + "## MG" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ZTVADPJDia1g" + }, + "outputs": [], + "source": [ + "# List municipalities to loop\n", + "\n", + "municipio_mg = 
municipio.query('sigla_uf==\"MG\"')\n", + "municipios_mg = municipio_mg['id_municipio'].tolist()\n", + "\n", + "# Rename and drop variables from different files\n", + "\n", + "# Competitive procurement - licitacao.csv\n", + "\n", + "mg1_rename = {'dat_pub_edital':'data_edital','seq_unidade':'id_unidade_gestora','dsc_forma_pagamento':'forma_pagamento',\n", + " 'dsc_ind_exclusiva_Micro':'exclusiva_micro_pequena','dsc_ind_pref_micro':'preferencia_micro_pequena','seq_licitacao':'id_licitacao',\n", + " 'seq_orgao':'orgao','dsc_modalidade':'modalidade','dsc_nat_objeto':'natureza_objeto','dsc_nat_processo':'natureza_processo',\n", + " 'dsc_objeto_licitacao':'descricao_objeto','num_mes_referencia':'mes','num_ano_referencia':'ano', 'dsc_tipo_licitacao':'tipo',\n", + " 'dsc_tipo_cadastro':'tipo_cadastro','num_convidados':'quantidade_convidados', 'dat_abert_proc_adm':'data_abertura',\n", + " 'num_ano_processo':'ano_processo'}\n", + "\n", + "mg1_drop = ['dat_receb_prev_doc','dat_veic_pub_001','dat_veic_pub_002','dsc_clausula_prorrog','dsc_criterio_adjud',\n", + " 'num_versao_arq','dat_conv_edital','dsc_criterio_aceit','vlr_rsp_nao_processado',\n", + " 'vlr_rsp_processado','dsc_ind_reserv_micro','dsc_ind_sub_contrato_micro','num_exercicio_edital',\n", + " 'num_modalidade','num_prazo_execucao','num_processo','dsc_processo_lote', 'dsc_veic_pub_001','dsc_veic_pub_002',\n", + " 'dsc_unid_medida','dsc_regime_exec_obra']\n", + "\n", + "# Resources from competitive procurement - recLicitacao.csv\n", + "\n", + "mg2_rename = {'seq_licitacao':'id_licitacao','seq_orgao':'orgao', 'dsc_funcao':'funcao', 'dsc_subfuncao':'subfuncao',\n", + " 'dsc_programa':'programa','dsc_acao':'acao','num_mes_referencia':'mes','num_ano_referencia':'ano',\n", + " 'dsc_nat_despesa':'natureza_despesa','vlr_recurso':'valor_orcamento'}\n", + "\n", + "mg2_drop = ['seq_rec_licitacao','dsc_dotacao','dsc_subacao','dsc_fonte_recurso','num_versao_arq']\n", + "\n", + "# Itens from competitive procurement - 
homologLicitacao.csv\n", + "# Get values - quantity * unitary price\n", + "\n", + "mg3_rename = {'seq_item_licitacao':'id_item','seq_licitacao':'id_licitacao','seq_orgao':'orgao',\n", + " 'num_ano_referencia':'ano', 'vlr_unitario':'valor_unitario', 'num_quant_item':'quantidade'}\n", + "\n", + "mg3_drop = ['seq_hom_licitacao','num_mes_referencia', 'num_versao_arq', 'vlr_pct_desconto',\n", + " 'vlr_pct_tax_adm', 'vlr_global', 'num_doc_vencedor','nom_vencedor','num_versao_arq']\n", + "\n", + "# Non-competitive procurement - dispensa.csv\n", + "\n", + "mg4_rename = {'dat_pub_termo':'data_publicacao_dispensa','seq_unidade':'id_unidade_gestora','dsc_forma_pagamento':'forma_pagamento',\n", + " 'seq_dispensa':'id_dispensa','seq_orgao':'orgao','dsc_tipo_processo':'modalidade','dsc_nat_objeto':'natureza_objeto',\n", + " 'dsc_objeto':'descricao_objeto','num_mes_referencia':'mes','num_ano_referencia':'ano','dsc_tipo_cadastro':'tipo_cadastro',\n", + " 'dat_abertura':'data_abertura','num_ano_processo':'ano_processo'}\n", + "\n", + "mg4_drop = ['dsc_ind_processo_lote','dsc_justificativa','dsc_razao','dsc_veiculo_pub',\n", + " 'num_versao_arq','vlr_rsp_proc','num_processo','vlr_rsp_nao_proc']\n", + "\n", + "# Resources from non-competitive procurement - recDispensa.csv\n", + "\n", + "mg5_rename = {'seq_dispensa':'id_dispensa','seq_orgao':'orgao', 'dsc_funcao':'funcao', 'dsc_subfuncao':'subfuncao',\n", + " 'dsc_programa':'programa','dsc_acao':'acao','num_mes_referencia':'mes','num_ano_referencia':'ano',\n", + " 'dsc_nat_despesa':'natureza_despesa','vlr_recurso':'valor_orcamento'}\n", + "\n", + "mg5_drop = ['seq_rec_dispensa','dsc_dotacao','dsc_subacao','dsc_fonte_recurso','num_versao_arq']\n", + "\n", + "# Itens from non-competitive procurement - fornDispensa.csv\n", + "# Get values - quantity * unitary price\n", + "\n", + "mg6_rename = {'seq_item_dispensa':'id_item','seq_dispensa':'id_dispensa','seq_orgao':'orgao',\n", + " 'num_ano_referencia':'ano', 'vlr_item':'valor_unitario', 
'num_quant_item':'quantidade'}\n", + "\n", + "mg6_drop = ['seq_forn_dispensa','num_inscr_estadual', 'dsc_sigla_uf','num_mes_referencia',\n", + " 'num_certidao_inss', 'dat_emi_cert_inss', 'dat_emi_cert_fgts', 'dat_val_cert_fgts',\n", + " 'dat_val_cert_inss', 'num_cert_fgts', 'num_cndt','dat_emi_cndt','dat_val_cndt',\n", + " 'num_doc_fornecedor','dsc_nom_fornecedor','num_versao_arq']\n", + "\n", + "# Dictionary\n", + "\n", + "modalidade = {'1 - CONVITE':'1','1 - DISPENSA':'8','2 - TOMADA DE PREÇOS':'2',\n", + " '2 - INEXIGIBILIDADE':'10','3 - INEXIGIBILIDADE POR CREDENCIAMENTO/CHAMADA PÚBLICA':'10',\n", + " '3 - CONCORRÊNCIA':'3','4 - CONCURSO':'11','4 - DISPENSA POR CHAMADA PÚBLICA':'8',\n", + " '5 - PREGÃO PRESENCIAL':'5','6 - PREGÃO ELETRÔNICO':'6','7 - LEILÃO':'7'}\n", + "\n", + "natureza_processo = {'1 - NORMAL':'1','2 - REGISTRO DE PREÇOS':'2',\n", + " '3 - CREDENCIAMENTO/ CHAMADA PÚBLICA':'3',\n", + " '3 - CREDENCIAMENTO/CHAMADA PÚBLICA':'3'}\n", + "\n", + "natureza_objeto = {'1 - OBRAS E SERVIÇOS DE ENGENHARIA':'1','2 - COMPRAS E OUTROS SERVIÇOS':'2',\n", + " '3 - LOCAÇÃO DE IMÓVEIS':'3','3 - LOCAÇÃO DE IMOVÉIS':'3','4 - CONCESSÃO':'4',\n", + " '5 - PERMISSÃO':'5','6 - ALIENAÇÃO DE BENS':'6',\n", + " '7 - COMPRAS PARA OBRAS E/OU SERVIÇOS DE ENGENHARIA':'7',' - ':''}\n", + "\n", + "tipo = {'1 - MENOR PREÇO':'1','2 - MELHOR TÉCNICA':'2','3 - TÉCNICA E PREÇO':'3',\n", + " '4 - MAIOR LANCE OU OFERTA':'4',' - ':''}\n", + "\n", + "tipo_cadastro = {'1 - CADASTRO INICIAL':'1','2 - RETIFICAÇÃO':'2'}\n", + "\n", + "preferencia_micro_pequena = {'SIM':'1','NÃO':'0'}" + ] + }, + { + "cell_type": "code", + "source": [ + "# List years\n", + "\n", + "anos_mg = ['2014','2015','2016','2017','2018','2019','2020','2021']\n", + "\n", + "# MG folder\n", + "folder = os.path.join(path,\"input/MG\")\n", + "\n", + "# Initialize an empty list and loop through each CSV file\n", + "\n", + "all_df_mg = []\n", + "for a in anos_mg:\n", + " for m in municipios_mg:\n", + " zip_files = 
os.path.join(folder, '{}/licitacao_{}.zip'.format(a,a))\n", + " if a in ['2014', '2015', '2016']:\n", + " arquivo1 = f'licitacao/{m}/{a}.{m}.licitacao.licitacao.csv'\n", + " arquivo2 = f'licitacao/{m}/{a}.{m}.licitacao.regadesao.csv'\n", + " arquivo3 = f'licitacao/{m}/{a}.{m}.licitacao.recLicitacao.csv'\n", + " arquivo4 = f'licitacao/{m}/{a}.{m}.licitacao.homologLicitacao.csv'\n", + " arquivo5 = f'licitacao/{m}/{a}.{m}.licitacao.dispensa.csv'\n", + " arquivo6 = f'licitacao/{m}/{a}.{m}.licitacao.recDispensa.csv'\n", + " arquivo7 = f'licitacao/{m}/{a}.{m}.licitacao.fornDispensa.csv'\n", + "\n", + " elif a == '2017':\n", + " arquivo1 = f'{a}/licitacao/{m}/{a}.{m}.licitacao.licitacao.csv'\n", + " arquivo2 = f'{a}/licitacao/{m}/{a}.{m}.licitacao.regadesao.csv'\n", + " arquivo3 = f'{a}/licitacao/{m}/{a}.{m}.licitacao.recLicitacao.csv'\n", + " arquivo4 = f'{a}/licitacao/{m}/{a}.{m}.licitacao.homologLicitacao.csv'\n", + " arquivo5 = f'{a}/licitacao/{m}/{a}.{m}.licitacao.dispensa.csv'\n", + " arquivo6 = f'{a}/licitacao/{m}/{a}.{m}.licitacao.recDispensa.csv'\n", + " arquivo7 = f'{a}/licitacao/{m}/{a}.{m}.licitacao.fornDispensa.csv'\n", + " else:\n", + " arquivo1 = f'{a}.{m}.licitacao.licitacao.csv'\n", + " arquivo2 = f'{a}.{m}.licitacao.regadesao.csv'\n", + " arquivo3 = f'{a}.{m}.licitacao.recLicitacao.csv'\n", + " arquivo4 = f'{a}.{m}.licitacao.homologLicitacao.csv'\n", + " arquivo5 = f'{a}.{m}.licitacao.dispensa.csv'\n", + " arquivo6 = f'{a}.{m}.licitacao.recDispensa.csv'\n", + " arquivo7 = f'{a}.{m}.licitacao.fornDispensa.csv'\n", + "\n", + " with ZipFile(zip_files) as z:\n", + " # Competitive procurement\n", + " with z.open(arquivo1) as f:\n", + " mg1 = pd.read_csv(f,sep=';', encoding='utf-8', dtype=str)\n", + " mg1['id_municipio'] = m\n", + " mg1['sigla_uf'] = \"MG\"\n", + "\n", + " # Drop and rename\n", + " mg1.rename(mg1_rename, axis=1, inplace=True)\n", + " mg1.drop(mg1_drop, axis=1, inplace=True)\n", + "\n", + " # Format date columns\n", + " 
mg1['data_edital']=mg1['data_edital'].str[6:]+ '-' + mg1['data_edital'].str[3:5] + '-' + mg1['data_edital'].str[:2]\n", + " mg1['data_abertura']=mg1['data_abertura'].str[6:]+ '-' + mg1['data_abertura'].str[3:5] + '-' + mg1['data_abertura'].str[:2]\n", + "\n", + " # Create a unique identifier for each purchase\n", + " mg1['id_licitacao_bd'] = mg1['id_licitacao'] + mg1['id_unidade_gestora'] + mg1['sigla_uf']\n", + "\n", + " # Resources from competitive procurement\n", + " with z.open(arquivo3) as f:\n", + " mg2 = pd.read_csv(f,sep=';', encoding='utf-8', dtype=str)\n", + " mg2['id_municipio'] = m\n", + "\n", + " # Drop and rename\n", + " mg2.rename(mg2_rename, axis=1, inplace=True)\n", + " mg2.drop(mg2_drop, axis=1, inplace=True)\n", + "\n", + " # Format value columns\n", + " mg2['valor_orcamento'] = mg2['valor_orcamento'].replace('################', np.nan)\n", + " mg2['valor_orcamento']=mg2['valor_orcamento'].astype(float)\n", + "\n", + " # Sum the budgeted resources by tender - id_licitacao\n", + " mg2 = mg2.groupby(['id_municipio','orgao','ano','mes','id_licitacao'])['valor_orcamento'].sum().to_frame('valor_orcamento').reset_index()\n", + "\n", + " # Items from competitive procurement\n", + " with z.open(arquivo4) as f:\n", + " mg3 = pd.read_csv(f,sep=';', encoding='utf-8', dtype=str)\n", + " mg3['id_municipio'] = m\n", + "\n", + " # Drop and rename\n", + " mg3.rename(columns=mg3_rename, inplace=True)\n", + " mg3.drop(mg3_drop, axis=1, inplace=True)\n", + "\n", + " # Format value columns\n", + " mg3['quantidade'] = mg3['quantidade'].astype(float)\n", + " mg3['valor_unitario'] = mg3['valor_unitario'].astype(float)\n", + "\n", + " # Create valor = unit price * quantity\n", + " mg3['valor'] = np.where((mg3['valor_unitario'].notnull()) | (mg3['quantidade'].notnull()),\n", + " mg3['quantidade'] * mg3['valor_unitario'] , np.nan)\n", + "\n", + " # Group by tender - id_licitacao\n", + " mg3 = 
mg3.groupby(['id_municipio','orgao','ano','id_licitacao'])['valor'].sum().to_frame('valor').reset_index()\n", + "\n", + " # Non-competitive procurement (dispensa/inexigibilidade)\n", + " with z.open(arquivo5) as f:\n", + " mg4 = pd.read_csv(f,sep=';', encoding='utf-8', dtype='string')\n", + " mg4['id_municipio'] = m\n", + " mg4['sigla_uf'] = \"MG\"\n", + "\n", + " # Drop and rename\n", + " mg4.rename(mg4_rename, axis=1, inplace=True)\n", + " mg4.drop(mg4_drop, axis=1, inplace=True)\n", + "\n", + " # Format date columns\n", + " mg4['data_publicacao_dispensa']=mg4['data_publicacao_dispensa'].str[6:]+ '-' + mg4['data_publicacao_dispensa'].str[3:5] + '-' + mg4['data_publicacao_dispensa'].str[:2]\n", + " mg4['data_abertura']=mg4['data_abertura'].str[6:]+ '-' + mg4['data_abertura'].str[3:5] + '-' + mg4['data_abertura'].str[:2]\n", + "\n", + " # Create a unique identifier for each purchase\n", + " mg4['id_licitacao_bd'] = mg4['id_dispensa'] + mg4['id_unidade_gestora'] + mg4['sigla_uf']\n", + "\n", + " # Resources from non-competitive procurement\n", + " with z.open(arquivo6) as f:\n", + " mg5 = pd.read_csv(f,sep=';', encoding='utf-8', dtype=str)\n", + " mg5['id_municipio'] = m\n", + "\n", + " # Drop and rename\n", + " mg5.rename(mg5_rename, axis=1, inplace=True)\n", + " mg5.drop(mg5_drop, axis=1, inplace=True)\n", + "\n", + " # Format value columns\n", + " mg5['valor_orcamento'] = mg5['valor_orcamento'].replace('################', np.nan)\n", + " mg5['valor_orcamento']=mg5['valor_orcamento'].astype(float)\n", + "\n", + " # Group by tender - id_dispensa\n", + " mg5 = mg5.groupby(['id_municipio','orgao','ano','mes','id_dispensa'])['valor_orcamento'].sum().to_frame('valor_orcamento').reset_index()\n", + "\n", + " # Items from non-competitive procurement\n", + " with z.open(arquivo7) as f:\n", + " mg6 = pd.read_csv(f,sep=';', encoding='utf-8', dtype=str)\n", + " mg6['id_municipio'] = m\n", + " mg6.rename(columns=mg6_rename, inplace=True)\n", + " mg6.drop(mg6_drop, axis=1, 
inplace=True)\n", + "\n", + " # Format value columns\n", + " mg6['valor_unitario'] = mg6['valor_unitario'].astype(float)\n", + " mg6['quantidade'] = mg6['quantidade'].astype(float)\n", + "\n", + " # Create valor = unitary price * quantity\n", + " mg6['valor'] = np.where((mg6['valor_unitario'].notnull()) | (mg6['quantidade'].notnull()),\n", + " mg6['quantidade'] * mg6['valor_unitario'] , np.nan)\n", + "\n", + " # Group by tender - id_dispensa\n", + " mg6 = mg6.groupby(['id_municipio','orgao','ano','id_dispensa'])['valor'].sum().to_frame('valor').reset_index()\n", + "\n", + " # Merge competitive procurement files\n", + " merge1 = pd.merge(mg1, mg2, how='left', left_on=['id_municipio','id_licitacao','ano','mes','orgao'],\n", + " right_on=['id_municipio','id_licitacao','ano','mes','orgao'])\n", + "\n", + " merge1 = pd.merge(merge1, mg3, how='left', left_on=['id_municipio','id_licitacao','ano','orgao'],\n", + " right_on=['id_municipio','id_licitacao','ano','orgao'])\n", + "\n", + " # Merge non competitive procurement files\n", + " merge2 = pd.merge(mg4, mg5, how='left', left_on=['id_municipio','id_dispensa','ano','mes','orgao'],\n", + " right_on=['id_municipio','id_dispensa','ano','mes','orgao'])\n", + "\n", + " merge2 = pd.merge(merge2, mg6, how='left', left_on=['id_municipio','id_dispensa','ano','orgao'],\n", + " right_on=['id_municipio','id_dispensa','ano','orgao'])\n", + "\n", + " # Append competitive and non competitive\n", + " mg = merge1.append([merge2],ignore_index=True)\n", + "\n", + " # Append all\n", + " all_df_mg.append(mg)\n", + "\n", + "# Concatenate all DataFrames in the list into a single DataFrame\n", + "\n", + "mg = pd.concat(all_df_mg, ignore_index=True, sort=True)\n", + "\n", + "# Create valor_corrigido\n", + "# If both 'valor' and 'valor_orcamento' are not null, choose the smaller of the two values.\n", + "# If 'valor' is not null but 'valor_orcamento' is null, use 'valor'.\n", + "# If 'valor' is null but 'valor_orcamento' is not null, use 
'valor_orcamento'\n", + "\n", + "mg['valor_corrigido'] = mg.apply(lambda x: min(x['valor'], x['valor_orcamento'])\n", + " if pd.notnull(x['valor']) and pd.notnull(x['valor_orcamento'])\n", + " else x['valor'] if pd.notnull(x['valor'])\n", + " else x['valor_orcamento'], axis=1)\n", + "\n", + "floats = ['vlr_empenhado','vlr_liquidado','vlr_pago']\n", + "mg[floats] = mg[floats].astype(float)\n", + "\n", + "# Replace by dictionary\n", + "\n", + "mg['modalidade'] = mg['modalidade'].replace(modalidade, regex=True)\n", + "mg['natureza_processo'] = mg['natureza_processo'].replace(natureza_processo, regex=True)\n", + "mg['natureza_objeto'] = mg['natureza_objeto'].replace(natureza_objeto, regex=True)\n", + "mg['tipo'] = mg['tipo'].replace(tipo, regex=True)\n", + "mg['tipo_cadastro'] = mg['tipo_cadastro'].replace(tipo_cadastro, regex=True)\n", + "mg['preferencia_micro_pequena'] = mg['preferencia_micro_pequena'].replace(preferencia_micro_pequena, regex=True)\n", + "mg['exclusiva_micro_pequena'] = mg['exclusiva_micro_pequena'].replace(preferencia_micro_pequena, regex=True)\n", + "\n", + "# Reorder columns\n", + "mg = mg.reindex(columns=ordem)\n", + "\n", + "# Save\n", + "mg.to_csv(os.path.join(path, \"output/licitacao_mg.csv\"), index=False, na_rep='', float_format='%.2f')" + ], + "metadata": { + "id": "MTbjj_NqBZcU" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "5STSQIbsg0ZN" + }, + "source": [ + "## PR" + ] + }, + { + "cell_type": "code", + "source": [ + "# List variables to drop and rename\n", + "\n", + "pr_drop = ['DataReferencia','dsClausulaProrrogacao','dsRegimeExecucaoLicitacao',\n", + " 'nmEntidade','nrEditalOrigem','nrLicitacao','nranoEditalOrigem',\n", + " 'ultimoEnvioSIMAMNesteExercicio','dtEnvio','nmMunicipio']\n", + "\n", + "pr_rename = {'cdIBGE':'id_municipio', 'dsAvaliacaoLicitacao':'tipo', 'dsModalidadeLicitacao':'modalidade',\n", + " 
# PR (TCE-PR) configuration: raw columns to discard, raw -> standard column
# names, and label -> code dictionaries used by the PR cleaning loop.
# NOTE: `modalidade`, `natureza_objeto`, `tipo`, `situacao` and
# `natureza_processo` are module-level names that other state cells rebind
# with their own labels; run this cell right before the PR loop.

# Raw columns of the "Licitacao" files that are not kept
pr_drop = [
    'DataReferencia',
    'dsClausulaProrrogacao',
    'dsRegimeExecucaoLicitacao',
    'nmEntidade',
    'nrEditalOrigem',
    'nrLicitacao',
    'nranoEditalOrigem',
    'ultimoEnvioSIMAMNesteExercicio',
    'dtEnvio',
    'nmMunicipio',
]

# Raw -> standard names for the "Licitacao" (tender) files
pr_rename = {
    'cdIBGE': 'id_municipio',
    'dsAvaliacaoLicitacao': 'tipo',
    'dsModalidadeLicitacao': 'modalidade',
    'dsNaturezaLicitacao': 'natureza_processo',
    'idLicitacao': 'id_licitacao',
    'dsObjeto': 'descricao_objeto',
    'dsClassificacaoObjetoLicitacao': 'natureza_objeto',
    'dsTipoSituacaoLicitacao': 'situacao',
    'dtAbertura': 'data_abertura',
    'dtEdital': 'data_edital',
    'idPessoa': 'id_unidade_gestora',
    'nrAnoLicitacao': 'ano',
    'vlLicitacao': 'valor_orcamento',
    'dtOcorrencia': 'data_homologacao',
}

# Raw -> standard names for the "LicitacaoVencedor" (winning item) files
item_pr_rename = {
    'cdIBGE': 'id_municipio',
    'idlicitacao': 'id_licitacao',
    'idPessoa': 'id_unidade_gestora',
    'nrAnoLicitacao': 'ano',
    'nrQuantidadeVencedorLicitacao': 'quantidade',
    'vlLicitacaoVencedorLicitacao': 'valor_vencedor',
    'nrItem': 'numero',
    'nrLote': 'numero_lote',
    'nrClassificacao': 'numero_classificacao',
}

# Dictionaries (label found in the raw data -> code written to the output)

modalidade = {
    'Convite': '1',
    'Tomada de Preços': '2',
    'Concorrência': '3',
    'Pregão': '4',
    'Leilão': '7',
    'Processo Dispensa': '8',
    'Processo Inexigibilidade': '10',
    'Concurso': '11',
    'Regime Diferenciado de Contratações - RDC': '12',
    'Lei Ordinária nº 13.303/2016': '',
}

natureza_objeto = {
    'Obras e Serviços de Engenharia': '1',
    'Compras e Serviços': '2',
    'Alienação de Bens': '6',
    'Concessão de Direito Real de Uso': '11',
    'Cessão de Direitos': '12',
}

# The "- Item" / "- Lote" variants of a criterion share the same code
tipo = {
    'Menor Preço - Item': '1',
    'Menor Preço - Lote': '1',
    'Melhor Técnica - Item': '2',
    'Melhor Técnica - Lote': '2',
    'Técnica e Preço - Item': '3',
    'Técnica e Preço - Lote': '3',
    'Maior Lance ou Oferta - Item': '4',
    'Maior Lance ou Oferta - Lote': '4',
    'Maior Retorno Econômico': '5',
    'Dispensa/Inexigibilidade': '13',
}

situacao = {
    'Homologada': '1',
    'Andamento': '2',
    'Andamento – Nova Data de Abertura': '2',
    'Revogada': '3',
    'Deserta': '4',
    'Fracassada': '5',
    'Anulada': '6',
}

# Keys are the full comma-joined labels exactly as they appear in the raw
# column (including the trailing ", "); they are matched as whole values.
natureza_processo = {
    ('Lei Ordinária nº 13.303/2016 - Dados Abertos, '
     'Lei Ordinária nº 13.303/2016 - Dados Sigilosos, '
     'Normal, '): '4',
    ('Lei Ordinária nº 13.303/2016 - Dados Abertos, '
     'Lei Ordinária nº 13.303/2016 - Dados Sigilosos, '
     'Normal, '
     'Proc.de Disp.Inc I e II do art.24 da Lei 8666/93, '): '5',
    ('Credenciamento, '
     'Lei Ordinária nº 13.303/2016 - Dados Abertos, '
     'Lei Ordinária nº 13.303/2016 - Dados Sigilosos, '
     'Normal, '
     'Registro de Preços, '): '6',
    'Normal, Registro de Preços, ': '6',
    ('Lei Ordinária nº 13.303/2016 - Dados Abertos, '
     'Lei Ordinária nº 13.303/2016 - Dados Sigilosos, '): '',
}
# Build the PR tender table (licitacao_pr.csv).
# For every year x municipality, the "Licitacao" file (one row per tender) is
# cleaned and enriched with the awarded value aggregated from the matching
# "LicitacaoVencedor" file (one row per item x bidder).

# Municipalities for which the XML could not be transformed into CSV
pr_sem_csv = ['411915', '411370', '411535', '411710', '412627', '410140', '410350']

municipio_pr = municipio.query('sigla_uf=="PR"')
municipio_pr = municipio_pr[~municipio_pr['id_municipio_6'].isin(pr_sem_csv)]
municipios_pr = municipio_pr['id_municipio_6'].tolist()

anos_pr = ['2013','2014','2015','2016','2017','2018','2019','2020','2021']

pasta_pr = '/content/gdrive/MyDrive/ComprasPublicas_Brasil/input/PR'

# Label -> code dictionaries; whole-value match (no regex) as the PR labels are exact
dicionarios_pr = {
    'modalidade': modalidade,
    'natureza_objeto': natureza_objeto,
    'tipo': tipo,
    'situacao': situacao,
    'natureza_processo': natureza_processo,
}

chaves_item = ['ano', 'id_municipio', 'id_licitacao', 'numero_lote', 'numero']
chaves_licitacao = ['ano', 'id_municipio', 'id_licitacao_bd']

# Initialize an empty list and loop through each CSV file

all_files_pr = []
for a in anos_pr:
    for m in municipios_pr:
        # Plain string formatting instead of exec(): same path, no dynamic code
        path_lic = f'{pasta_pr}/{a}/Licitacao/{m}/{a}_{m}_Licitacao.csv'

        pr = pd.read_csv(path_lic, sep=',', encoding='utf-8', dtype=str)

        # Drop and rename
        pr.drop(pr_drop, axis=1, inplace=True)
        pr.rename(pr_rename, axis=1, inplace=True)

        # Format date columns (keep the first 10 characters only)
        for coluna in ['data_edital', 'data_abertura', 'data_homologacao']:
            pr[coluna] = pr[coluna].str[:10]

        # Merge id_municipio 6 and 7 digits
        pr['id_municipio'] = pr['id_municipio'].astype(str)
        pr = pd.merge(pr, municipio, how='left', left_on='id_municipio', right_on='id_municipio_6')

        pr.drop(['id_municipio_x','id_municipio_6','nome'], axis=1, inplace=True)
        pr.rename({'id_municipio_y':'id_municipio'}, axis=1, inplace=True)

        # Create a unique identifier for each purchase
        pr['id_licitacao_bd'] = pr['id_licitacao'] + pr['id_unidade_gestora'] + pr['sigla_uf']

        # Replace by dictionary
        for coluna, dicionario in dicionarios_pr.items():
            pr[coluna] = pr[coluna].replace(dicionario)

        # Merge with files LicitacaoVencedor to get the variable 'valor'
        # Goal: aggregate unitary price * quantity by tender

        path_lic_venc = f'{pasta_pr}/{a}/Licitacao/{m}/{a}_{m}_LicitacaoVencedor.csv'

        item_pr = pd.read_csv(path_lic_venc, sep=',', encoding='utf-8', dtype=str,
                              usecols=['nrAnoLicitacao','cdIBGE','idlicitacao','idPessoa',
                                       'nrQuantidadeVencedorLicitacao','vlLicitacaoVencedorLicitacao',
                                       'nrLote','nrItem','nrClassificacao'])

        item_pr.rename(item_pr_rename, axis=1, inplace=True)

        # Merge id_municipio 6 and 7 digits
        item_pr = pd.merge(item_pr, municipio, how='left', left_on='id_municipio', right_on='id_municipio_6')

        item_pr.drop(['id_municipio_x','id_municipio_6','nome'], axis=1, inplace=True)
        item_pr.rename({'id_municipio_y':'id_municipio'}, axis=1, inplace=True)

        # Drop non suppliers: keep, for each item, the best (lowest) classification.
        # The column is read as text, so it is compared numerically; a string
        # minimum would rank '10' ahead of '2'.
        item_pr['classificacao_num'] = pd.to_numeric(item_pr['numero_classificacao'], errors='coerce')
        item_pr['min_classificacao'] = item_pr.groupby(chaves_item)['classificacao_num'].transform('min')
        item_pr = item_pr[item_pr['classificacao_num'] == item_pr['min_classificacao']].copy()

        # Create a unique identifier for each purchase
        item_pr['id_licitacao_bd'] = item_pr['id_licitacao'] + item_pr['id_unidade_gestora'] + item_pr['sigla_uf']

        # Format value columns
        item_pr['valor_vencedor'] = item_pr['valor_vencedor'].astype(float)
        item_pr['quantidade'] = item_pr['quantidade'].astype(float)

        # Create valor = unitary price * quantity (missing if either factor is missing)
        item_pr['valor_total'] = item_pr['quantidade'] * item_pr['valor_vencedor']

        # Group by tender (id_licitacao_bd).
        # Columns are selected with a list: tuple-style selection is rejected by recent pandas.
        # NOTE(review): sum() returns 0 (not NaN) for tenders whose items all lack a value.
        valor_total = (item_pr.groupby(chaves_licitacao)[['valor_vencedor', 'valor_total']]
                       .sum()
                       .reset_index())

        # Merge with tender dataframe
        pr = pd.merge(pr, valor_total, how='left', on=chaves_licitacao)

        # Append all
        all_files_pr.append(pr)

# Concatenate all DataFrames in the list into a single DataFrame

pr = pd.concat(all_files_pr, ignore_index=True, sort=True)

# Deal with outliers: values above the 99.9th percentile are set to missing

pr['valor_orcamento'] = pr['valor_orcamento'].astype(float)
pr['valor_orcamento'] = pr['valor_orcamento'].mask(pr['valor_orcamento'] > pr['valor_orcamento'].quantile(.999))

pr.rename({'valor_total':'valor'}, axis=1, inplace=True)
pr['valor'] = pr['valor'].mask(pr['valor'] > pr['valor'].quantile(.999))

# Create valor_corrigido
# If both 'valor' and 'valor_orcamento' are not null, choose the smaller of the two values.
# If only one of them is not null, use that one; if both are null the result is null.
# (row-wise min skips missing values, which is exactly this rule)

pr['valor_corrigido'] = pr[['valor', 'valor_orcamento']].min(axis=1)

# Reorder columns
pr = pr.reindex(columns=ordem)

# Save
pr.to_csv(os.path.join(path, "output/licitacao_pr.csv"), index=False, na_rep='', float_format='%.2f')
# RS (TCE-RS / LicitaCon) configuration: municipalities, raw -> standard
# column names, raw columns to discard and code dictionaries.
# NOTE: `modalidade`, `tipo`, `situacao`, `natureza_objeto` and
# `natureza_processo` are rebound here with the RS codes.

# List municipalities to loop

municipio_rs = municipio[municipio['sigla_uf'] == "RS"]
municipios_rs = municipio_rs['id_municipio'].tolist()

# List variables to drop and rename

rs_rename = {
    'CD_ORGAO': 'orgao',
    'ANO_LICITACAO': 'ano',
    'CD_TIPO_MODALIDADE': 'modalidade',
    'TP_OBJETO': 'natureza_objeto',
    'TP_LICITACAO': 'tipo',
    'TP_CARACTERISTICA_OBJETO': 'contratacao',
    'TP_NATUREZA': 'natureza_processo',
    'TP_RESULTADO_GLOBAL': 'situacao',
    'DS_OBJETO': 'descricao_objeto',
    'VL_LICITACAO': 'valor_orcamento',
    'DT_ABERTURA': 'data_abertura',
    'DT_HOMOLOGACAO': 'data_homologacao',
    'VL_HOMOLOGADO': 'valor',
    'BL_COVID19': 'covid_19',
    'CD_TIPO_FASE_ATUAL': 'estagio',
    'CD_MUNICIPIO_IBGE': 'id_municipio',
    'NR_LICITACAO': 'id_licitacao',
    'ANO_PROCESSO': 'ano_processo',
}

rs_drop = [
    'NR_COMISSAO', 'ANO_COMISSAO', 'TP_COMISSAO', 'NR_PROCESSO',
    'TP_NIVEL_JULGAMENTO', 'DT_AUTORIZACAO_ADESAO', 'TP_REGIME_EXECUCAO',
    'BL_PERMITE_SUBCONTRATACAO', 'TP_FORNECIMENTO', 'TP_ATUACAO_REGISTRO',
    'NR_LICITACAO_ORIGINAL', 'ANO_LICITACAO_ORIGINAL', 'NR_ATA_REGISTRO_PRECO',
    'DT_ATA_REGISTRO_PRECO', 'PC_TAXA_RISCO', 'TP_EXECUCAO',
    'TP_DISPUTA', 'TP_PREQUALIFICACAO', 'BL_INVERSAO_FASES',
    'CNPJ_ORGAO_GERENCIADOR', 'NM_ORGAO_GERENCIADOR', 'CD_TIPO_FUNDAMENTACAO',
    'NR_ARTIGO', 'DS_INCISO', 'DS_LEI',
    'DT_INICIO_INSCR_CRED', 'DT_FIM_INSCR_CRED', 'DT_INICIO_VIGEN_CRED',
    'DT_FIM_VIGEN_CRED', 'BL_ORCAMENTO_SIGILOSO', 'BL_RECEBE_INSCRICAO_PER_VIG',
    'BL_PERMITE_CONSORCIO', 'BL_LICIT_PROPRIA_ORGAO', 'TP_DOCUMENTO_FORNECEDOR',
    'NR_DOCUMENTO_FORNECEDOR', 'TP_DOCUMENTO_VENCEDOR', 'NR_DOCUMENTO_VENCEDOR',
    'BL_GERA_DESPESA', 'DS_OBSERVACAO', 'PC_TX_ESTIMADA', 'PC_TX_HOMOLOGADA',
    'DT_ADJUDICACAO', 'BL_COMPARTILHADA', 'LINK_LICITACON_CIDADAO',
]

# Dictionaries (raw code -> code written to the output).
# Entry order is kept as-is: the RS loop applies them with regex=True.

modalidade = {
    'CNV': '1',
    'TMP': '2',
    'CNC': '3', 'CCP': '3', 'CCE': '3',
    'PRP': '5', 'PCP': '5',
    'PRE': '6', 'PCE': '6',
    'LEI': '7', 'LEE': '7',
    'PRD': '8', 'CPP': '8',
    'PDE': '9',
    'PRI': '10',
    'CNS': '11',
    'RDC': '12', 'RDE': '12',
    'RIN': '29',
    'CPC': '30', 'CHP': '30',
    'RPO': '', 'ESE': '', 'EST': '',
}

contratacao = {
    'IT': '1', 'IU': '1',
    'LT': '2', 'LU': '2',
}

tipo = {
    'MPR': '1', 'MDE': '1',
    'MTC': '2', 'MPP': '2',
    'TPR': '3',
    'MLO': '4', 'MOP': '4',
    'MRE': '5',
    'MCA': '6',
    'MOO': '7', 'MOQ': '7',
    'MOT': '8',
    'MVT': '9',
    'MTO': '10',
    'MTT': '11',
    'MTX': '12',
    'NSA': '13',
}

estagio = {
    'INT': '1',
    'EPU': '2', 'PUB': '2',
    'HAP': '4',
    'ADH': '6',
}

situacao = {
    'A': '1',
    'R': '3',
    'D': '4',
    'F': '5',
    'N': '6',
}

natureza_objeto = {
    'OSE': '1',
    'CSE': '2',
    'LOC': '3',
    'COL': '4',
    'PER': '5',
    'ALB': '6',
    'COM': '8',
    'OUS': '9',
    'CON': '11',
    'PRI': '13',
    'PPP': '14',
    'SAU': '15',
    'INF': '16',
}

natureza_processo = {
    'N': '1',
    'R': '2',
    'O': '3',
}

covid_19 = {
    'S': '1',
    'N': '0',
}
# Build the RS tender table (licitacao_rs.csv) from the yearly
# Licitacao/<year>.csv.zip archives, each containing a 'licitacao.csv'.

# RS folder
folder = os.path.join(path, "input/RS")

anos_rs = ['2016','2017','2018','2019','2020','2021']

# Columns brought in by the orgao_municipio merge that are not needed
orgao_drop = ['NOME_ORGAO','ESFERA','SIGLA_ORGAO','SETOR_GOVERNAMENTAL','CNPJ','NATUREZA_JURIDICA',
              'CONTABILIDADE','SITUACAO_ORGAO','CD_MUNICIPIO_TCERS','NOME_MUNICIPIO']

# Raw code -> output code dictionaries, applied column by column below
dicionarios_rs = {
    'modalidade': modalidade,
    'contratacao': contratacao,
    'tipo': tipo,
    'estagio': estagio,
    'situacao': situacao,
    'natureza_objeto': natureza_objeto,
    'natureza_processo': natureza_processo,
    'covid_19': covid_19,
}

# Initialize an empty list and loop through each yearly archive

all_df_rs = []
for a in anos_rs:
    arquivo_zip = os.path.join(folder, 'Licitacao/{}.csv.zip'.format(a))
    with ZipFile(arquivo_zip) as z:
        with z.open('licitacao.csv') as f:
            rs = pd.read_csv(f, sep=',', encoding='utf-8', dtype=str)

    rs.drop(rs_drop, axis=1, inplace=True)

    # Merge to get id_municipio. Some agency ids are missing from
    # orgao_municipio; those tenders are discarded (only matched rows are kept).
    rs = pd.merge(rs, orgao_municipio, how='left', on='CD_ORGAO', indicator=True)
    rs = rs[rs['_merge'] == "both"]

    rs.drop(orgao_drop, axis=1, inplace=True)

    rs.rename(rs_rename, axis=1, inplace=True)

    # Small-business benefit flags
    beneficio = rs['TP_BENEFICIO_MICRO_EPP']
    rs['exclusiva_micro_pequena'] = np.where(beneficio == "L", 1, 0)  # L - exclusive tender
    rs['preferencia_micro_pequena'] = np.where(beneficio.isin(["T", "S"]), 1, 0)  # S - differentiated treatment or subcontracting

    rs.drop('TP_BENEFICIO_MICRO_EPP', axis=1, inplace=True)

    # Rows with modalidade "MAI" are excluded
    rs = rs[rs['modalidade'] != "MAI"].copy()

    # Piggyback procurement
    # RPO - adhesion to a price registration record (adesao a ata de registro de preco)

    rs['carona'] = np.where(rs['modalidade'] == "RPO", 1, 0)

    rs['sigla_uf'] = "RS"

    # Create a unique identifier for each purchase (uses the raw modalidade code,
    # so it must be built before the dictionary replacement below)
    rs['id_licitacao_bd'] = rs['id_licitacao'] + rs['ano'] + rs['modalidade'] + rs['orgao'] + rs['sigla_uf']

    # Replace by dictionary
    for coluna, dicionario in dicionarios_rs.items():
        rs[coluna] = rs[coluna].replace(dicionario, regex=True)

    # Format value columns
    rs['valor_orcamento'] = rs['valor_orcamento'].astype(float)
    rs['valor'] = rs['valor'].replace('#################', np.nan)  # overflow marker in the raw file
    rs['valor'] = rs['valor'].astype(float)

    # Create valor_corrigido
    # If both 'valor' and 'valor_orcamento' are not null, choose the smaller of the two values.
    # If only one of them is not null, use that one; if both are null the result is null.
    # (row-wise min skips missing values, which is exactly this rule)

    rs['valor_corrigido'] = rs[['valor', 'valor_orcamento']].min(axis=1)

    all_df_rs.append(rs)

# Concatenate all DataFrames in the list into a single DataFrame
rs = pd.concat(all_df_rs, ignore_index=True, sort=True)

# Reorder columns
rs = rs.reindex(columns=ordem)

# NOTE: about 11629 rows have ano != ano_processo

# Save
rs.to_csv(os.path.join(path, "output/licitacao_rs.csv"), index=False, na_rep='', float_format='%.2f')
# Build the PB tender table (licitacao_pb.csv) from the TCE-PB proposals
# file, which has one row per tender x bidder.

# PB folder
folder = os.path.join(path, "input/PB")

# Open files

pb = pd.read_csv(os.path.join(folder, "TCE-PB-Portal-Gestor-Licitacoes_Propostas.txt"), sep="|", encoding='utf-8', dtype=str)

pb_drop = ['jurisdicionado_id','nome_jurisdicionado','nome_tipo_jurisdicionado','nome_tipo_administracao_jurisdicionado','nome_esfera_jurisdicionado',
           'nome_setor_atual_licitacao','nome_proponente','cpf_cnpj_proponente','valor_proposta','situacao_proposta',
           'nome_estagio_processual_licitacao']

pb_rename = {'data_homologacao_licitacao':'data_homologacao','cd_ugestora':'id_unidade_gestora',
             'objeto_licitacao':'descricao_objeto','nome_modalidade_licitacao':'modalidade','valor_estimado_licitacao':'valor_orcamento',
             'valor_licitado_licitacao':'valor','situacao_fracassada_licitacao':'situacao','protocolo_licitacao':'id_licitacao'}

# Label -> code, applied as regex (substring) replacements in this order.
# '10 Lei 14.133/21' is what 'Inexigibilidade Lei 14.133/21' becomes once
# 'Inexigibilidade' has been replaced by '10'. It is mapped to '10' exactly once:
# a second entry with the same key mapping to '' used to override it and would
# blank these tenders whenever pandas applies the pairs sequentially.
modalidade = {'Convite':'1','Tomada de Preços':'2','Concorrência':'3','Pregão Medida Provisória 1.047/21':'4',
              'Pregão Lei 14.133/21':'4','Pregão Presencial':'5','Pregão Eletrônico':'6','Leilão':'7',
              'Dispensa Art. 24 - Lei 8.666/93':'8','Dispensa COVID-19 Art. 4º da Lei 13.979/2020':'8', 'Dispensa Lei 14.133/21':'8',
              'Dispensa Medida Provisória 1.047/21':'8','Dispensada Art. 17 - Lei 8.666/93':'8','Inexigibilidade':'10',
              '10 Lei 14.133/21':'10','Concurso':'11','RDC - Regime Diferenciado de Contratações Públicas':'12',
              'Licitação Internacional GN 2350-9':'29','Licitação Internacional GN 2349-9':'29',
              'Licitação Internacional Não Competitiva':'29','Chamada Pública':'31', 'Licitação da Lei Nº 13.303/2016':'',
              'Adesão a Ata de Registro de Preços':'', ' Art. 29 ou 30':'',
              'Contratação Emergencial de Organização Art. 12, II da Lei Nº 9.454/2011':''}

situacao = {'Não':'1','Sim':'5','GUARDA TEMPORÁRIA':''}

pb.rename(pb_rename, axis=1, inplace=True)

# nome_municipio has about 50298 missing values: regional inter-municipal consortia
# (e.g. Consórcio Intermunicipal de Saúde dos Municípios do Alto Sertão Paraibano)
# or state-level entities. They are dropped.
pb = pb.dropna(subset=['nome_municipio'])

# Create year from numero_licitacao (characters 6-9), fixing two known typos
pb['ano'] = pb['numero_licitacao'].str[6:10]
pb['ano'] = pb['ano'].str.replace("2104", "2014", regex=False)
pb['ano'] = pb['ano'].str.replace("3014", "2014", regex=False)

# Keep only winning proposals (reduced to one row per tender further below)
pb = pb[pb['situacao_proposta'] == "Vencedora"].copy()
pb['sigla_uf'] = "PB"

pb.drop(pb_drop, axis=1, inplace=True)

# Merge to get id_municipio (names are first harmonised with the municipio table)
pb['nome_municipio'] = pb['nome_municipio'].str.replace('Santa Terezinha', 'Santa Teresinha', regex=False)
pb['nome_municipio'] = pb['nome_municipio'].str.replace('Quixaba', 'Quixabá', regex=False)

pb = pd.merge(pb, municipio, how='left', left_on=['nome_municipio','sigla_uf'], right_on=['nome','sigla_uf'], indicator=True)
pb.drop(['nome','nome_municipio','id_municipio_6','id_municipio_tce'], axis=1, inplace=True)

# Format
pb['id_licitacao'] = pb['id_licitacao'].str[5:]
pb['id_licitacao'] = pb['id_licitacao'].str.replace('/', '', regex=False)
pb['valor'] = pb['valor'].astype(float)
pb['valor_orcamento'] = pb['valor_orcamento'].astype(float)

# Create a unique identifier for each purchase
pb['id_licitacao_bd'] = pb['id_licitacao'] + pb['id_unidade_gestora'] + pb['sigla_uf']

# Piggyback procurement (adesao a ata de registro de preco)
pb['carona'] = np.where(pb['modalidade'] == "Adesão a Ata de Registro de Preços", 1, 0)

# Covid 19 indicator (computed on the raw labels, before parentheses are stripped)
modalidades_covid = ["Dispensa COVID-19 (Art. 4º da Lei 13.979/2020)",
                     "Dispensa Medida Provisória 1.047/21",
                     "Pregão Medida Provisória 1.047/21"]
pb['covid_19'] = np.where(pb['modalidade'].isin(modalidades_covid), 1, 0)

# Create valor_corrigido
# If both 'valor' and 'valor_orcamento' are not null, choose the smaller of the two values.
# If only one of them is not null, use that one; if both are null the result is null.
# (row-wise min skips missing values, which is exactly this rule)

pb['valor_corrigido'] = pb[['valor', 'valor_orcamento']].min(axis=1)

# Replace by dictionary
# Parentheses are removed literally (regex=False) so they cannot be read as regex syntax
pb['modalidade'] = pb['modalidade'].str.replace('(', '', regex=False)
pb['modalidade'] = pb['modalidade'].str.replace(')', '', regex=False)
pb['modalidade'] = pb['modalidade'].str.strip()
pb['modalidade'] = pb['modalidade'].replace(modalidade, regex=True)
# Force this one: left over when the dictionary pairs are matched against the original labels
pb['modalidade'] = pb['modalidade'].replace('10 Lei 14.133/21', '10')
pb['situacao'] = pb['situacao'].replace(situacao, regex=True)

# Format date: dd/mm/yyyy -> yyyy-mm-dd
pb['data_homologacao'] = pb['data_homologacao'].str[6:] + '-' + pb['data_homologacao'].str[3:5] + '-' + pb['data_homologacao'].str[:2]

# Drop duplicates: suppliers level to tender level
pb.drop_duplicates(subset=['id_licitacao', 'id_municipio','id_unidade_gestora', 'ano'], inplace=True)

# Drop years after 2021
pb['ano'] = pb['ano'].astype(int)
pb = pb[pb['ano'] < 2022]

# NOTE: about 4593 rows have ano != year of homologation

# Reorder columns
pb = pb.reindex(columns=ordem)

# Save
pb.to_csv(os.path.join(path, "output/licitacao_pb.csv"), index=False, na_rep='', float_format='%.2f')
# Partition the per-state tender tables into
# output/licitacao/ano=<year>/sigla_uf=<UF>/microdados.csv.
#
# Each state's file is loaded once and only that state's rows are written to
# that state's partitions. The previous version re-looped over all states
# inside the year loop, rebinding `uf`, so the currently loaded state's rows
# were written into every state's folder.

# First year with data for each state (insertion order = processing order)
primeiro_ano = {'CE': 2009, 'PE': 2012, 'MG': 2014, 'PR': 2013, 'RS': 2016, 'PB': 2014}

# List of UFs
ufs = list(primeiro_ano)

# Years that are partitioned. NOTE(review): CE is declared from 2009, but only
# 2012-2021 are written, as before; widen this range if 2009-2011 are wanted.
anos_particao = range(2012, 2022)

pasta_saida = '/content/gdrive/MyDrive/ComprasPublicas_Brasil/output'

# Loop over each UF
for uf in ufs:
    # Load the corresponding CSV file for the UF
    file_path = f'{pasta_saida}/licitacao_{uf.lower()}.csv'
    df = pd.read_csv(file_path, dtype=str, encoding='utf-8')

    # Convert 'ano' column to integer
    df['ano'] = df['ano'].astype(int)

    # Save csv by year and state
    for ano in anos_particao:
        if ano < primeiro_ano[uf]:
            continue

        print("Particionando {} do {}".format(ano, uf))

        # The partition columns live in the folder names, not in the file.
        # drop() returns a new frame, so the filtered slice is never mutated in place.
        df2 = df[df['ano'] == ano].drop(['ano', 'sigla_uf'], axis=1)

        pasta_particao = f'{pasta_saida}/licitacao/ano={ano}/sigla_uf={uf}'
        os.makedirs(pasta_particao, exist_ok=True)  # to_csv does not create folders
        df2.to_csv(f'{pasta_particao}/microdados.csv', index=False, encoding='utf-8', na_rep='', float_format='%.2f')
# Setup for the tender item table (licitacao_item): packages, Google Drive
# mount, display options, auxiliary tables and the output column order.

# Necessary packages

import glob
import os
from datetime import datetime
from zipfile import ZipFile

import numpy as np
import pandas as pd
from google.colab import drive

# Connect to google drive

drive.mount('/content/gdrive')

# Display options

pd.set_option('display.max_columns', None)
pd.options.display.float_format = '{:.2f}'.format

# Set directory

path = '/content/gdrive/MyDrive/ComprasPublicas_Brasil'

# Open some auxiliary files

municipio = pd.read_csv(os.path.join(path, "auxiliary_files/municipio.csv"), encoding='utf-8', dtype=str)

id_tce = pd.read_csv(
    os.path.join(path, "input/PE/municipios.csv"),
    encoding='latin-1',
    dtype=str,
    usecols=['CODIGOIBGE', 'CODIGO', 'UNIDADEFEDERATIVA'],
)
id_tce = id_tce.rename(columns={
    'CODIGOIBGE': 'id_municipio',
    'CODIGO': 'id_municipio_tce',
    'UNIDADEFEDERATIVA': 'sigla_uf',
})

# Merge both: adds id_municipio_tce to the municipality table
municipio = municipio.merge(id_tce, how='left', on=['id_municipio', 'sigla_uf'])

# MG
ug_id = pd.read_csv(os.path.join(path, "auxiliary_files/ug_id_mg.csv"), sep=',', dtype=str)

# RS
orgao_municipio = pd.read_csv(
    os.path.join(path, "input/RS/orgaos_auditados_rs.csv"),
    encoding='utf-8',
    dtype=str,
    usecols=['CD_MUNICIPIO_IBGE', 'CD_ORGAO'],
)

# Create a list of UFs
ufs = municipio['sigla_uf'].drop_duplicates().tolist()

# Set columns order

ordem = [
    'ano', 'sigla_uf', 'id_municipio', 'orgao', 'id_unidade_gestora',
    'id_licitacao_bd', 'id_licitacao', 'id_dispensa', 'id_item_bd', 'id_item',
    'descricao', 'numero', 'numero_lote', 'unidade_medida',
    'quantidade_cotada', 'valor_unitario_cotacao',
    'quantidade', 'valor_unitario', 'valor_total',
    'quantidade_proposta', 'valor_proposta', 'valor_vencedor',
    'nome_vencedor', 'documento',
]
sep=',', dtype=str) # MG\n", + "\n", + "orgao_municipio = pd.read_csv(os.path.join(path, \"input/RS/orgaos_auditados_rs.csv\"), encoding='utf-8',dtype=str,\n", + " usecols=['CD_MUNICIPIO_IBGE', 'CD_ORGAO']) # RS\n", + "\n", + "# Create a list of UFs\n", + "ufs = municipio['sigla_uf'].unique().tolist()\n", + "\n", + "# Set columns order\n", + "\n", + "ordem = ['ano','sigla_uf','id_municipio','orgao','id_unidade_gestora', 'id_licitacao_bd','id_licitacao','id_dispensa','id_item_bd','id_item','descricao','numero', 'numero_lote',\n", + " 'unidade_medida','quantidade_cotada', 'valor_unitario_cotacao','quantidade','valor_unitario','valor_total', 'quantidade_proposta','valor_proposta','valor_vencedor',\n", + " 'nome_vencedor','documento']" + ], + "metadata": { + "id": "ng_hUtIp600a" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "hbqe1K4Nt4Hr" + }, + "source": [ + "## CE" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "WFMprQZeObe5" + }, + "outputs": [], + "source": [ + "#CE\n", + "\n", + "# Get a list of all CSV files\n", + "\n", + "all_files_ce_itens = glob.glob(os.path.join(path,\"input/CE/Licitações/itens_licitacoes_*.csv\"))\n", + "all_files_ce_licitantes = glob.glob(os.path.join(path,\"input/CE/Licitações/licitantes_*.csv\"))\n", + "\n", + "# For items - Initialize an empty list and loop through each CSV file\n", + "\n", + "all_df_ce = []\n", + "for f in all_files_ce_itens:\n", + " df1 = pd.read_csv(f, sep=';', dtype=str, encoding='latin-1')\n", + " df1['arquivo'] = f.split('/')[-1]\n", + " all_df_ce.append(df1)\n", + "\n", + "# Concatenate all DataFrames in the list into a single DataFrame\n", + "\n", + "ce1 = pd.concat(all_df_ce, ignore_index=True, sort=True)\n", + "\n", + "# Extract the year from the file name\n", + "\n", + "ce1['ano'] = ce1['arquivo'].str[17:21]\n", + "\n", + "# Drop and rename original variables\n", + "\n", + "ce1_drop = 
['codigo_tipo_negociante','arquivo']\n", + "\n", + "ce1.drop(ce1_drop, axis=1, inplace=True)\n", + "\n", + "ce1_rename = {'numero_licitacao':'id_licitacao','descricao_item_licitacao':'descricao',\n", + " 'descricao_unidade_item_licitacao':'unidade_medida','valor_vencedor_item_licitacao':'valor_total',\n", + " 'valor_unitario_item_licitacao ':'valor_unitario','numero_quantidade_item_licitacao':'quantidade',\n", + " 'numero_documento_negociante':'documento','numero_sequencial_item_licitacao':'numero'}\n", + "\n", + "ce1.rename(ce1_rename, axis=1, inplace=True)\n", + "\n", + "# Merge between codigo_municipio and id_municipio_ibge\n", + "\n", + "id_mun = pd.read_csv(os.path.join(path,\"input/CE/municipios.csv\"), sep=';', dtype=str, encoding='latin-1',\n", + " usecols=['geoibgeId','codigo_municipio'])\n", + "\n", + "id_mun.rename({'geoibgeId':'id_municipio'}, axis=1, inplace=True)\n", + "\n", + "ce1 = pd.merge(ce1, id_mun, how='left', left_on='codigo_municipio', right_on='codigo_municipio')\n", + "\n", + "# Format document number\n", + "ce1['documento'] = ce1['documento'].astype(str)\n", + "ce1['documento']=ce1['documento'].str.strip()\n", + "\n", + "# For suppliers - Initialize an empty list and loop through each CSV file\n", + "\n", + "all_df_ce = []\n", + "for f in all_files_ce_licitantes:\n", + " df1 = pd.read_csv(f, sep=';', dtype=str, encoding='latin-1')\n", + " df1['arquivo'] = f.split('/')[-1]\n", + " all_df_ce.append(df1)\n", + "\n", + "# Concatenate all DataFrames in the list into a single DataFrame\n", + "\n", + "ce2 = pd.concat(all_df_ce, ignore_index=True, sort=True)\n", + "\n", + "# Extract the year from the file name\n", + "\n", + "ce2['ano'] = ce2['arquivo'].str[11:15]\n", + "\n", + "# Drop and rename original variables\n", + "\n", + "ce2_drop = ['fone_negociante', 'codigo_tipo_negociante',\n", + " 'endereco_negociante','cep_negociante', 'nome_municipio_negociante', 'arquivo',\n", + " 'codigo_uf ']\n", + "\n", + "ce2.drop(ce2_drop, axis=1, 
inplace=True)\n", + "\n", + "ce2_rename = {'numero_licitacao':'id_licitacao', 'nome_negociante':'nome_vencedor',\n", + " 'numero_documento_negociante':'documento'}\n", + "\n", + "ce2.rename(ce2_rename, axis=1, inplace=True)\n", + "\n", + "# Format document number\n", + "\n", + "ce2['documento'] = ce2['documento'].astype(str)\n", + "ce2['documento']=ce2['documento'].str.strip()\n", + "\n", + "# Merge items (ce1) and suppliers files (ce2) to get suppliers name\n", + "\n", + "ce = pd.merge(ce1, ce2, how='left', left_on=['codigo_municipio','ano','id_licitacao','documento','data_realizacao_licitacao'],\n", + " right_on=['codigo_municipio','ano','id_licitacao','documento','data_realizacao_licitacao'])\n", + "\n", + "# Date format\n", + "\n", + "ce['data_realizacao_licitacao'] = ce['data_realizacao_licitacao'].str[:10]\n", + "\n", + "# Assign state acronym to the 'sigla_uf'\n", + "\n", + "ce['sigla_uf']='CE'\n", + "\n", + "# Create a unique identifier for each purchase\n", + "\n", + "ce['id_licitacao_bd'] = ce['id_licitacao'] + ce['id_municipio'] + ce['ano'].str[2:4] + ce['sigla_uf']\n", + "\n", + "# Some id_licitacao_bd have two rows - Import tender (licitacao) table to verify\n", + "\n", + "ce3 = pd.read_csv(os.path.join(path, \"output/licitacao_ce.csv\"), dtype=str, encoding='utf-8',\n", + " usecols=['id_municipio','ano','id_licitacao','id_licitacao_bd'])\n", + "\n", + "ce3.rename({'id_licitacao_bd':'id_licitacao_bd_2'}, axis=1, inplace=True)\n", + "\n", + "# Merge with licitacao\n", + "# If the id is different in the two tables, input missing\n", + "\n", + "ce = pd.merge(ce, ce3, how='left', left_on=['id_municipio','ano','id_licitacao'],\n", + " right_on=['id_municipio','ano','id_licitacao'], indicator=True)\n", + "\n", + "ce['id_licitacao_bd'] = np.where((ce['_merge']==\"both\") & (ce['id_licitacao_bd'] != ce['id_licitacao_bd_2']), np.nan, ce['id_licitacao_bd'])\n", + "\n", + "# Drop non-necessary variable\n", + "ce.drop('data_realizacao_licitacao', axis=1, 
inplace=True)\n", + "\n", + "# Create a unique identifier for each item\n", + "ce['id_item'] = ce['numero'].astype(str) + ce['id_municipio'] + ce['id_licitacao'] + ce['ano'].str[2:4]\n", + "\n", + "# 0.70% duplicates in id_item\n", + "# 0.11% duplicates - when id_licitacao_bd is not missing\n", + "\n", + "# Create a unique identifier for each item across states\n", + "ce['id_item_bd'] = ce['id_item'] + ce['sigla_uf']\n", + "\n", + "# Drop rows duplicated in all variables (few cases)\n", + "ce.drop_duplicates(inplace=True)\n", + "\n", + "# We can have duplicates for items supplied by different suppliers\n", + "# When the same id_item_bd has different descriptions, or the same description but a different quantity/value, replace id_item_bd with missing\n", + "\n", + "ce['id_item_bd'] = np.where((ce.duplicated(['id_item_bd'], keep=False)) & (~ce.duplicated(['id_item_bd', 'descricao'], keep=False)), np.nan, ce['id_item_bd'])\n", + "ce['id_item_bd'] = np.where((ce.duplicated(['id_item_bd','descricao'], keep=False)) & (~ce.duplicated(['id_item_bd', 'descricao','quantidade'], keep=False)), np.nan, ce['id_item_bd'])\n", + "ce['id_item_bd'] = np.where((ce.duplicated(['id_item_bd','descricao','quantidade'], keep=False)) & (~ce.duplicated(['id_item_bd', 'descricao','quantidade','valor_total'], keep=False)), np.nan, ce['id_item_bd'])\n", + "\n", + "# Format\n", + "\n", + "ce['ano'] = ce['ano'].astype(int)\n", + "ce['numero'] = ce['numero'].astype(int)\n", + "ce['quantidade'] = pd.to_numeric(ce['quantidade'] , errors='coerce').fillna(0).astype(int)\n", + "ce['quantidade'] = ce['quantidade'].replace(0,'')\n", + "\n", + "floats = ['valor_unitario','valor_total']\n", + "ce[floats] = ce[floats].astype(float)\n", + "\n", + "ce['length'] = ce['documento'].str.len()\n", + "ce['documento'] = np.where((ce['length'] == 13) & (ce['nome_vencedor']!=\"LUIZA DA SILVA LIMA - ME\"), ce['documento'].str.zfill(14), ce['documento'])\n", + "\n", + "# LUIZA DA SILVA LIMA - ME is excluded from the zero-padding because when adding the zero to the
left, it returns the CNPJ of another company, according to Google\n", + "ce['documento'] = np.where((ce['length'] == 15), ce['documento'].str[1:], ce['documento'])\n", + "ce['documento'] = np.where((ce['length'] == 13) & (ce['nome_vencedor']!=\"LUIZA DA SILVA LIMA - ME\"), ce['documento'].str.zfill(14), ce['documento'])\n", + "\n", + "# Harmonize a misspelled supplier name so its rows are not kept as duplicates\n", + "ce['nome_vencedor'] = ce['nome_vencedor'].replace('JOSÉ LUS DE LIMA - ME', 'JOSÉ LUIS DE LIMA - ME')\n", + "\n", + "# Drop unnecessary variable\n", + "ce.drop('length', axis=1, inplace=True)\n", + "\n", + "# Drop rows duplicated in all variables, after the changes to documento\n", + "ce.drop_duplicates(inplace=True)\n", + "\n", + "# 0.20% duplicates in id_item - all of them when id_licitacao_bd is missing\n", + "\n", + "# Reorder columns\n", + "ce = ce.reindex(columns=ordem)\n", + "\n", + "# Save\n", + "ce.to_csv(os.path.join(path, \"output/licitacao_item_ce.csv\"), index=False, na_rep='', float_format='%.2f')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "B5DfSEqUrWqH" + }, + "source": [ + "## MG" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "lY5FC-_5cKjr" + }, + "outputs": [], + "source": [ + "# List municipalities to loop\n", + "\n", + "municipio_mg = municipio.query('sigla_uf==\"MG\"')\n", + "municipios_mg = municipio_mg['id_municipio'].tolist()\n", + "\n", + "ug_id1 = ug_id.query('modalidade != \"8\" & modalidade !=\"10\"')\n", + "ug_id2 = ug_id.query('modalidade == \"8\" | modalidade ==\"10\"')\n", + "\n", + "# Rename and list variables to drop from different files\n", + "\n", + "# Competitive procurement items\n", + "\n", + "mg1_rename = {'seq_item_licitacao':'id_item','seq_licitacao':'id_licitacao','seq_orgao':'orgao',\n", + " 'num_ano_referencia':'ano', 'dsc_unid_medida':'unidade_medida', 'dsc_item':'descricao',\n", + " 'num_lote':'numero_lote','num_item':'numero'}\n", + "\n", + "mg1_drop = ['num_mes_referencia','num_versao_arq','cod_item','dsc_lote']\n",
+ "\n", + "# Items price quotation (competitive procurement)\n", + "\n", + "mg2_rename = {'seq_item_licitacao':'id_item','seq_licitacao':'id_licitacao','seq_orgao':'orgao',\n", + " 'num_ano_referencia':'ano', 'vlr_cot_preco_unit':'valor_unitario_cotacao',\n", + " 'num_quant_item_cotado':'quantidade_cotada'}\n", + "\n", + "mg2_drop = ['seq_cot_licitacao','num_mes_referencia', 'dat_cotacao', 'vlr_percentual',\n", + " 'vlr_min_alien_bens', 'num_versao_arq']\n", + "\n", + "# Items with reference price\n", + "\n", + "mg3_rename = {'seq_item_licitacao':'id_item','seq_licitacao':'id_licitacao','seq_orgao':'orgao',\n", + " 'num_ano_referencia':'ano', 'vlr_item':'valor_unitario_cotacao'}\n", + "\n", + "mg3_drop = ['seq_cred_licitacao','num_mes_referencia', 'num_versao_arq']\n", + "\n", + "# Homologated items (suppliers)\n", + "\n", + "mg4_rename = {'seq_item_licitacao':'id_item','seq_licitacao':'id_licitacao','seq_orgao':'orgao',\n", + " 'num_ano_referencia':'ano', 'vlr_unitario':'valor_unitario', 'num_quant_item':'quantidade',\n", + " 'num_doc_vencedor':'documento', 'nom_vencedor':'nome_vencedor'}\n", + "\n", + "mg4_drop = ['seq_hom_licitacao','num_mes_referencia', 'num_versao_arq', 'vlr_pct_desconto',\n", + " 'vlr_pct_tax_adm', 'vlr_global', 'num_versao_arq']\n", + "\n", + "# Non-competitive procurement items (dispensa/inexigibilidade)\n", + "\n", + "mg5_rename = {'seq_item_dispensa':'id_item','seq_dispensa':'id_dispensa','seq_orgao':'orgao',\n", + " 'num_ano_referencia':'ano', 'dsc_unid_medida':'unidade_medida', 'dsc_item':'descricao',\n", + " 'num_item':'numero'}\n", + "\n", + "mg5_drop = ['num_mes_referencia','num_versao_arq','cod_item']\n", + "\n", + "# Items price quotation (non-competitive procurement)\n", + "\n", + "mg6_rename = {'seq_item_dispensa':'id_item','seq_dispensa':'id_dispensa','seq_orgao':'orgao',\n", + " 'num_ano_referencia':'ano', 'vlr_preco_unit':'valor_unitario_cotacao',\n", + " 'num_quant_item':'quantidade_cotada'}\n", + "\n", + "mg6_drop = ['seq_cot_dispensa','num_mes_referencia', 'num_versao_arq']\n", + "\n",
+ "# Suppliers\n", + "\n", + "mg7_rename = {'seq_item_dispensa':'id_item','seq_dispensa':'id_dispensa','seq_orgao':'orgao',\n", + " 'num_ano_referencia':'ano', 'vlr_item':'valor_unitario', 'num_quant_item':'quantidade',\n", + " 'num_doc_fornecedor':'documento', 'dsc_nom_fornecedor':'nome_vencedor'}\n", + "\n", + "mg7_drop = ['seq_forn_dispensa','num_inscr_estadual', 'dsc_sigla_uf','num_mes_referencia',\n", + " 'num_certidao_inss', 'dat_emi_cert_inss', 'dat_emi_cert_fgts', 'dat_val_cert_fgts',\n", + " 'dat_val_cert_inss', 'num_cert_fgts', 'num_cndt','dat_emi_cndt','dat_val_cndt',\n", + " 'num_versao_arq']\n", + "\n", + "# MG folder\n", + "folder = os.path.join(path,\"input/MG\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ZhbCKf4duBz7" + }, + "outputs": [], + "source": [ + "anos_mg = ['2014','2015','2016']\n", + "\n", + "all_df_mg=[]\n", + "for a in anos_mg:\n", + " for m in municipios_mg:\n", + " df = os.path.join(folder, '{}/licitacao_{}.zip'.format(a,a))\n", + " with ZipFile(df) as z:\n", + "\n", + " # Competitive procurement itens\n", + " with z.open('licitacao/{}/{}.{}.licitacao.itemLicitacao.csv'.format(m,a,m)) as f:\n", + " mg1 = pd.read_csv(f,sep=';', encoding='utf-8', dtype=str)\n", + " mg1['id_municipio'] = m\n", + " mg1.rename(columns=mg1_rename, inplace=True)\n", + " mg1.drop(mg1_drop, axis=1, inplace=True)\n", + " mg1 = mg1.reindex(columns=['id_municipio','id_item','id_licitacao','ano','orgao','descricao','unidade_medida','numero','numero_lote'])\n", + "\n", + " # Quoted items\n", + " with z.open('licitacao/{}/{}.{}.licitacao.cotacaoLicitacao.csv'.format(m,a,m)) as f:\n", + " mg2 = pd.read_csv(f,sep=';', encoding='utf-8', dtype=str)\n", + " mg2['id_municipio'] = m\n", + " mg2.rename(columns=mg2_rename, inplace=True)\n", + " mg2.drop(mg2_drop, axis=1, inplace=True)\n", + " mg2 = mg2.reindex(columns=['id_municipio','id_item','id_licitacao','ano','orgao','quantidade_cotada', 'valor_unitario_cotacao'])\n", + 
"\n", + " # Items with reference price\n", + " with z.open('licitacao/{}/{}.{}.licitacao.refLicitacao.csv'.format(m,a,m)) as f:\n", + " mg3 = pd.read_csv(f,sep=';', encoding='utf-8', dtype=str)\n", + " mg3['id_municipio'] = m\n", + " mg3.rename(columns=mg3_rename, inplace=True)\n", + " mg3.drop(mg3_drop, axis=1, inplace=True)\n", + " mg3 = mg3.reindex(columns=['id_municipio','id_item','id_licitacao','ano','orgao','valor_unitario_cotacao'])\n", + "\n", + " # Homologated items (suppliers)\n", + " with z.open('licitacao/{}/{}.{}.licitacao.homologLicitacao.csv'.format(m,a,m)) as f:\n", + " mg4 = pd.read_csv(f,sep=';', encoding='utf-8', dtype=str)\n", + " mg4['id_municipio'] = m\n", + " mg4.rename(columns=mg4_rename, inplace=True)\n", + " mg4.drop(mg4_drop, axis=1, inplace=True)\n", + " mg4 = mg4.reindex(columns=['id_municipio','id_item','id_licitacao','ano','orgao','quantidade','valor_unitario','nome_vencedor','documento'])\n", + "\n", + " # Non competitive procurement (dispensa/inexibilidade)\n", + " with z.open('licitacao/{}/{}.{}.licitacao.itemDispensa.csv'.format(m,a,m)) as f:\n", + " mg5 = pd.read_csv(f,sep=';', encoding='utf-8', dtype=str)\n", + " mg5['id_municipio'] = m\n", + " mg5.rename(columns=mg5_rename, inplace=True)\n", + " mg5.drop(mg5_drop, axis=1, inplace=True)\n", + " mg5 = mg5.reindex(columns=['id_municipio','id_item','id_dispensa','ano','orgao','descricao','unidade_medida','numero'])\n", + "\n", + " # Quoted items\n", + " with z.open('licitacao/{}/{}.{}.licitacao.cotDispensa.csv'.format(m,a,m)) as f:\n", + " mg6 = pd.read_csv(f,sep=';', encoding='utf-8', dtype=str)\n", + " mg6['id_municipio'] = m\n", + " mg6.rename(columns=mg6_rename, inplace=True)\n", + " mg6.drop(mg6_drop, axis=1, inplace=True)\n", + " mg6 = mg6.reindex(columns=['id_municipio','id_item','id_dispensa','ano','orgao','quantidade_cotada', 'valor_unitario_cotacao'])\n", + "\n", + " # Suppliers\n", + " with z.open('licitacao/{}/{}.{}.licitacao.fornDispensa.csv'.format(m,a,m)) as f:\n", + 
" mg7 = pd.read_csv(f,sep=';', encoding='utf-8', dtype=str)\n", + " mg7['id_municipio'] = m\n", + " mg7.rename(columns=mg7_rename, inplace=True)\n", + " mg7.drop(mg7_drop, axis=1, inplace=True)\n", + " mg7 = mg7.reindex(columns=['id_municipio','id_item','id_dispensa','ano','orgao','quantidade','valor_unitario','nome_vencedor','documento'])\n", + "\n", + " # Merge competitive procurement\n", + "\n", + " # First - merge quoted items with items with reference price\n", + " merge1 = pd.merge(mg2, mg3, how='outer', left_on=['id_municipio','id_item','id_licitacao','ano','orgao','valor_unitario_cotacao'],\n", + " right_on=['id_municipio','id_item','id_licitacao','ano','orgao','valor_unitario_cotacao'])\n", + "\n", + " # Second - merge items general information with their quoted or reference price\n", + " merge2 = pd.merge(mg1, merge1 ,how='left', left_on=['id_municipio','id_item','id_licitacao','ano','orgao'],\n", + " right_on=['id_municipio','id_item','id_licitacao','ano','orgao'])\n", + "\n", + " # Third - merge with homologated items to get suppliers\n", + " mg_licitacao_1= pd.merge(merge2, mg4 ,how='left', left_on=['id_municipio','id_item','id_licitacao','ano','orgao'],\n", + " right_on=['id_municipio','id_item','id_licitacao','ano','orgao'])\n", + "\n", + " mg_licitacao_1['documento']=mg_licitacao_1['documento'].str.strip()\n", + "\n", + " # Merge to get id_unidade_gestora\n", + " mg_licitacao_1 = pd.merge(mg_licitacao_1, ug_id1, how='left', left_on=['ano','id_municipio','id_licitacao'],\n", + " right_on=['ano','id_municipio','id_licitacao'])\n", + "\n", + " # Assign state acronym to the 'sigla_uf'\n", + " mg_licitacao_1['sigla_uf'] = \"MG\"\n", + "\n", + " # Create a unique identifier for each purchase, as in licitacao table\n", + " mg_licitacao_1['id_licitacao_bd'] = mg_licitacao_1['id_licitacao'] + mg_licitacao_1['id_unidade_gestora'] + mg_licitacao_1['sigla_uf']\n", + "\n", + " # Keep only necessary variables\n", + " mg_licitacao_1 = 
mg_licitacao_1.reindex(columns=ordem)\n", + "\n", + " # Merge non competitive procurement\n", + "\n", + " # First - merge items general information with their quoted price\n", + " merge3 = pd.merge(mg5, mg6, how='outer', left_on=['id_municipio','id_item','id_dispensa','ano','orgao'],\n", + " right_on=['id_municipio','id_item','id_dispensa','ano','orgao'])\n", + "\n", + " # Second - merge with homologated items to get suppliers\n", + " mg_dispensa_1 = pd.merge(merge3, mg7 ,how='left', left_on=['id_municipio','id_item','id_dispensa','ano','orgao'],\n", + " right_on=['id_municipio','id_item','id_dispensa','ano','orgao'])\n", + "\n", + " mg_dispensa_1['documento']=mg_dispensa_1['documento'].str.strip()\n", + "\n", + " # Merge to get id_unidade_gestora\n", + " mg_dispensa_1 = pd.merge(mg_dispensa_1, ug_id2, how='left', left_on=['ano','id_municipio','id_dispensa'],\n", + " right_on=['ano','id_municipio','id_dispensa'])\n", + "\n", + " # Assign state acronym to the 'sigla_uf'\n", + " mg_dispensa_1['sigla_uf'] = \"MG\"\n", + "\n", + " # Create a unique identifier for each purchase, as in licitacao table\n", + " mg_dispensa_1['id_licitacao_bd'] = mg_dispensa_1['id_dispensa'] + mg_dispensa_1['id_unidade_gestora'] + mg_dispensa_1['sigla_uf']\n", + "\n", + " # Keep only necessary variables\n", + " mg_dispensa_1 = mg_dispensa_1.reindex(columns=ordem)\n", + "\n", + " # Append competitive and non competitive\n", + " mg_item_1 = mg_licitacao_1.append([mg_dispensa_1],ignore_index=True)\n", + "\n", + " # Variables format\n", + " floats = ['valor_unitario_cotacao', 'valor_proposta', 'valor_unitario','valor_total',\n", + " 'quantidade','quantidade_cotada']\n", + "\n", + " mg_item_1[floats] = mg_item_1[floats].astype(float)\n", + "\n", + " strings= ['id_licitacao','id_dispensa','id_unidade_gestora']\n", + " mg_item_1[strings] = mg_item_1[strings].astype(str)\n", + "\n", + " mg_item_1['numero_lote'] = mg_item_1['numero_lote'].replace(\"-1\",np.nan)\n", + " mg_item_1['id_licitacao'] = 
mg_item_1['id_licitacao'].replace(\"nan\",np.nan)\n", + " mg_item_1['id_dispensa'] = mg_item_1['id_dispensa'].replace(\"nan\",np.nan)\n", + "\n", + " # Create total value\n", + "\n", + " mg_item_1['valor_total'] = np.where((mg_item_1['valor_unitario'].notnull()) | (mg_item_1['quantidade'].notnull()),\n", + " mg_item_1['quantidade'] * mg_item_1['valor_unitario'] , np.nan)\n", + "\n", + " # Create a unique identifier for each item\n", + " mg_item_1['id_item_bd'] = mg_item_1['id_item'] + mg_item_1['id_unidade_gestora'] + mg_item_1['sigla_uf']\n", + "\n", + " # Duplicates only allowed for items supplied by different suppliers\n", + " mg_item_1['id_item_bd'] = np.where((mg_item_1.duplicated(['id_item_bd'], keep=False)) & (~mg_item_1.duplicated(['id_item_bd', 'descricao'], keep=False)), np.nan, mg_item_1['id_item_bd'])\n", + "\n", + " # Drop duplicates in all variables\n", + " mg_item_1.drop_duplicates(inplace=True)\n", + "\n", + " # Reorder columns\n", + " mg_item_1 = mg_item_1.reindex(columns=ordem)\n", + "\n", + " # Partition by year and municipality\n", + "\n", + " mg_item_1.drop(['ano','sigla_uf'],axis=1,inplace=True)\n", + "\n", + " exec(\"mg_item_1.to_csv('/content/gdrive/MyDrive/ComprasPublicas_Brasil/output/licitacao_item/ano={}/sigla_uf=MG/municipio_{}.csv', index=False, encoding='utf-8', na_rep='', float_format='%.2f')\".format(a,m))\n", + "\n", + " # Append all\n", + " mg_item_1['ano'] = a\n", + " mg_item_1['sigla_uf'] = \"MG\"\n", + "\n", + " all_df_mg.append(mg_item_1)\n", + "\n", + "mg_item_1 = pd.concat(all_df_mg, ignore_index=True, sort=True)\n", + "\n", + "mg_item_1.to_csv(os.path.join(path, \"output/temp/mg_item_1.csv\"), index=False, na_rep='', float_format='%.2f')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "jC8QHVa_5gvm" + }, + "outputs": [], + "source": [ + "anos_mg = ['2017']\n", + "\n", + "all_df_mg=[]\n", + "\n", + "for a in anos_mg:\n", + " for m in municipios_mg:\n", + " df = os.path.join(folder, 
'{}/licitacao_{}.zip'.format(a,a))\n", + " with ZipFile(df) as z:\n", + "\n", + " # Competitive procurement itens\n", + " with z.open('{}/licitacao/{}/{}.{}.licitacao.itemLicitacao.csv'.format(a,m,a,m)) as f:\n", + " mg1 = pd.read_csv(f,sep=';', encoding='utf-8', dtype=str)\n", + " mg1['id_municipio'] = m\n", + " mg1.rename(columns=mg1_rename, inplace=True)\n", + " mg1.drop(mg1_drop, axis=1, inplace=True)\n", + " mg1 = mg1.reindex(columns=['id_municipio','id_item','id_licitacao','ano','orgao','descricao','unidade_medida','numero','numero_lote'])\n", + "\n", + " # Quoted itens\n", + " with z.open('{}/licitacao/{}/{}.{}.licitacao.cotacaoLicitacao.csv'.format(a,m,a,m)) as f:\n", + " mg2 = pd.read_csv(f,sep=';', encoding='utf-8', dtype=str)\n", + " mg2['id_municipio'] = m\n", + " mg2.rename(columns=mg2_rename, inplace=True)\n", + " mg2.drop(mg2_drop, axis=1, inplace=True)\n", + " mg2 = mg2.reindex(columns=['id_municipio','id_item','id_licitacao','ano','orgao','quantidade_cotada', 'valor_unitario_cotacao'])\n", + "\n", + " # Items with reference price\n", + " with z.open('{}/licitacao/{}/{}.{}.licitacao.refLicitacao.csv'.format(a,m,a,m)) as f:\n", + " mg3 = pd.read_csv(f,sep=';', encoding='utf-8', dtype=str)\n", + " mg3['id_municipio'] = m\n", + " mg3.rename(columns=mg3_rename, inplace=True)\n", + " mg3.drop(mg3_drop, axis=1, inplace=True)\n", + " mg3 = mg3.reindex(columns=['id_municipio','id_item','id_licitacao','ano','orgao','valor_unitario_cotacao'])\n", + "\n", + " # Homologated itens (suppliers)\n", + " with z.open('{}/licitacao/{}/{}.{}.licitacao.homologLicitacao.csv'.format(a,m,a,m)) as f:\n", + " mg4 = pd.read_csv(f,sep=';', encoding='utf-8', dtype=str)\n", + " mg4['id_municipio'] = m\n", + " mg4.rename(columns=mg4_rename, inplace=True)\n", + " mg4.drop(mg4_drop, axis=1, inplace=True)\n", + " mg4 = mg4.reindex(columns=['id_municipio','id_item','id_licitacao','ano','orgao','quantidade','valor_unitario','nome_vencedor','documento'])\n", + "\n", + " # Non 
competitive procurement (dispensa/inexibilidade)\n", + " with z.open('{}/licitacao/{}/{}.{}.licitacao.itemDispensa.csv'.format(a,m,a,m)) as f:\n", + " mg5 = pd.read_csv(f,sep=';', encoding='utf-8', dtype=str)\n", + " mg5['id_municipio'] = m\n", + " mg5.rename(columns=mg5_rename, inplace=True)\n", + " mg5.drop(mg5_drop, axis=1, inplace=True)\n", + " mg5 = mg5.reindex(columns=['id_municipio','id_item','id_dispensa','ano','orgao','descricao','unidade_medida','numero'])\n", + "\n", + " # Quoted items\n", + " with z.open('{}/licitacao/{}/{}.{}.licitacao.cotDispensa.csv'.format(a,m,a,m)) as f:\n", + " mg6 = pd.read_csv(f,sep=';', encoding='utf-8', dtype=str)\n", + " mg6['id_municipio'] = m\n", + " mg6.rename(columns=mg6_rename, inplace=True)\n", + " mg6.drop(mg6_drop, axis=1, inplace=True)\n", + " mg6 = mg6.reindex(columns=['id_municipio','id_item','id_dispensa','ano','orgao','quantidade_cotada', 'valor_unitario_cotacao'])\n", + "\n", + " # Suppliers\n", + " with z.open('{}/licitacao/{}/{}.{}.licitacao.fornDispensa.csv'.format(a,m,a,m)) as f:\n", + " mg7 = pd.read_csv(f,sep=';', encoding='utf-8', dtype=str)\n", + " mg7['id_municipio'] = m\n", + " mg7.rename(columns=mg7_rename, inplace=True)\n", + " mg7.drop(mg7_drop, axis=1, inplace=True)\n", + " mg7 = mg7.reindex(columns=['id_municipio','id_item','id_dispensa','ano','orgao','quantidade','valor_unitario','nome_vencedor','documento'])\n", + "\n", + " # Merge competitive procurement\n", + "\n", + " # First - merge quoted items with items with reference price\n", + " merge1 = pd.merge(mg2, mg3, how='outer', left_on=['id_municipio','id_item','id_licitacao','ano','orgao','valor_unitario_cotacao'],\n", + " right_on=['id_municipio','id_item','id_licitacao','ano','orgao','valor_unitario_cotacao'])\n", + "\n", + " # Second - merge items general information with their quoted or reference price\n", + " merge2 = pd.merge(mg1, merge1 ,how='left', left_on=['id_municipio','id_item','id_licitacao','ano','orgao'],\n", + " 
right_on=['id_municipio','id_item','id_licitacao','ano','orgao'])\n", + "\n", + " # Third - merge with homologated items to get suppliers\n", + " mg_licitacao_1= pd.merge(merge2, mg4 ,how='left', left_on=['id_municipio','id_item','id_licitacao','ano','orgao'],\n", + " right_on=['id_municipio','id_item','id_licitacao','ano','orgao'])\n", + "\n", + " mg_licitacao_1['documento']=mg_licitacao_1['documento'].str.strip()\n", + "\n", + " # Merge to get id_unidade_gestora\n", + " mg_licitacao_1 = pd.merge(mg_licitacao_1, ug_id1, how='left', left_on=['ano','id_municipio','id_licitacao'],\n", + " right_on=['ano','id_municipio','id_licitacao'])\n", + "\n", + " # Assign state acronym to the 'sigla_uf'\n", + " mg_licitacao_1['sigla_uf'] = \"MG\"\n", + "\n", + " # Create a unique identifier for each purchase, as in licitacao table\n", + " mg_licitacao_1['id_licitacao_bd'] = mg_licitacao_1['id_licitacao'] + mg_licitacao_1['id_unidade_gestora'] + mg_licitacao_1['sigla_uf']\n", + "\n", + " # Keep only necessary variables\n", + " mg_licitacao_1 = mg_licitacao_1.reindex(columns=ordem)\n", + "\n", + " # Merge non competitive procurement\n", + "\n", + " # First - merge items general information with their quoted price\n", + " merge3 = pd.merge(mg5, mg6, how='outer', left_on=['id_municipio','id_item','id_dispensa','ano','orgao'],\n", + " right_on=['id_municipio','id_item','id_dispensa','ano','orgao'])\n", + "\n", + " # Second - merge with homologated items to get suppliers\n", + " mg_dispensa_1 = pd.merge(merge3, mg7 ,how='left', left_on=['id_municipio','id_item','id_dispensa','ano','orgao'],\n", + " right_on=['id_municipio','id_item','id_dispensa','ano','orgao'])\n", + "\n", + " mg_dispensa_1['documento']=mg_dispensa_1['documento'].str.strip()\n", + "\n", + " # Merge to get id_unidade_gestora\n", + " mg_dispensa_1 = pd.merge(mg_dispensa_1, ug_id2, how='left', left_on=['ano','id_municipio','id_dispensa'],\n", + " right_on=['ano','id_municipio','id_dispensa'])\n", + "\n", + " # Assign 
state acronym to the 'sigla_uf'\n", + " mg_dispensa_1['sigla_uf'] = \"MG\"\n", + "\n", + " # Create a unique identifier for each purchase, as in licitacao table\n", + " mg_dispensa_1['id_licitacao_bd'] = mg_dispensa_1['id_dispensa'] + mg_dispensa_1['id_unidade_gestora'] + mg_dispensa_1['sigla_uf']\n", + "\n", + " # Keep only necessary variables\n", + " mg_dispensa_1 = mg_dispensa_1.reindex(columns=ordem)\n", + "\n", + " # Append competitive and non competitive\n", + " mg_item_2 = mg_licitacao_1.append([mg_dispensa_1],ignore_index=True)\n", + "\n", + " # Variables format\n", + " floats = ['valor_unitario_cotacao', 'valor_proposta', 'valor_unitario','valor_total',\n", + " 'quantidade','quantidade_cotada']\n", + "\n", + " mg_item_2[floats] = mg_item_2[floats].astype(float)\n", + "\n", + " strings= ['id_licitacao','id_dispensa','id_unidade_gestora']\n", + " mg_item_2[strings] = mg_item_2[strings].astype(str)\n", + "\n", + " mg_item_2['numero_lote'] = mg_item_2['numero_lote'].replace(\"-1\",np.nan)\n", + " mg_item_2['id_licitacao'] = mg_item_2['id_licitacao'].replace(\"nan\",np.nan)\n", + " mg_item_2['id_dispensa'] = mg_item_2['id_dispensa'].replace(\"nan\",np.nan)\n", + "\n", + " # Create total value\n", + " mg_item_2['valor_total'] = np.where((mg_item_2['valor_unitario'].notnull()) | (mg_item_2['quantidade'].notnull()),\n", + " mg_item_2['quantidade'] * mg_item_2['valor_unitario'] , np.nan)\n", + "\n", + " # Create a unique identifier for each item\n", + " mg_item_2['id_item_bd'] = mg_item_2['id_item'] + mg_item_2['id_unidade_gestora'] + mg_item_2['sigla_uf']\n", + "\n", + " # Duplicates only allowed for items supplied by different suppliers\n", + "\n", + " mg_item_2['id_item_bd'] = np.where((mg_item_2.duplicated(['id_item_bd'], keep=False)) & (~mg_item_2.duplicated(['id_item_bd', 'descricao'], keep=False)),\n", + " np.nan, mg_item_2['id_item_bd'])\n", + "\n", + " mg_item_2['id_item_bd'] = 
np.where((mg_item_2.duplicated(['id_item_bd','descricao','documento','nome_vencedor'], keep=False))\n", + " & (~mg_item_2.duplicated(['id_item_bd', 'descricao','documento','nome_vencedor','quantidade'], keep=False)),\n", + " np.nan, mg_item_2['id_item_bd'])\n", + "\n", + " # Few duplicates in id_item_bd\n", + " mg_item_2.drop_duplicates(inplace=True)\n", + "\n", + " # Reorder columns\n", + " mg_item_2 = mg_item_2.reindex(columns=ordem)\n", + "\n", + " # Partition by year and municipality\n", + " mg_item_2.drop(['ano','sigla_uf'],axis=1,inplace=True)\n", + "\n", + " exec(\"mg_item_2.to_csv('/content/gdrive/MyDrive/ComprasPublicas_Brasil/output/licitacao_item/ano={}/sigla_uf=MG/municipio_{}.csv', index=False, encoding='utf-8', na_rep='', float_format='%.2f')\".format(a,m))\n", + "\n", + " # Append all\n", + " mg_item_2['ano'] = a\n", + " mg_item_2['sigla_uf'] = \"MG\"\n", + "\n", + " all_df_mg.append(mg_item_2)\n", + "\n", + "mg_item_2 = pd.concat(all_df_mg, ignore_index=True, sort=True)\n", + "\n", + "mg_item_2.to_csv(os.path.join(path, \"output/temp/mg_item_2.csv\"), index=False, na_rep='', float_format='%.2f')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ncy4wV6s39AR" + }, + "outputs": [], + "source": [ + "anos_mg = ['2018','2019','2020','2021']\n", + "\n", + "all_df_mg=[]\n", + "\n", + "for a in anos_mg:\n", + " for m in municipios_mg:\n", + " df = os.path.join(folder, '{}/licitacao_{}.zip'.format(a,a))\n", + " with ZipFile(df) as z:\n", + "\n", + " try:\n", + " # Competitive procurement items\n", + " with z.open('{}.{}.licitacao.itemLicitacao.csv'.format(a,m)) as f:\n", + " mg1 = pd.read_csv(f,sep=';', encoding='utf-8', dtype=str)\n", + " mg1['id_municipio'] = m\n", + " mg1.rename(columns=mg1_rename, inplace=True)\n", + " mg1.drop(mg1_drop, axis=1, inplace=True)\n", + " mg1 = mg1.reindex(columns=['id_municipio','id_item','id_licitacao','ano','orgao','descricao','unidade_medida','numero','numero_lote'])\n", + " except 
IOError:\n", + " print(\"Erro de input/output para o município {} e ano {} - mg1\".format(m, a))\n", + "\n", + " try:\n", + " # Quoted items\n", + " with z.open('{}.{}.licitacao.cotacaoLicitacao.csv'.format(a,m)) as f:\n", + " mg2 = pd.read_csv(f,sep=';', encoding='utf-8', dtype=str)\n", + " mg2['id_municipio'] = m\n", + " mg2.rename(columns=mg2_rename, inplace=True)\n", + " mg2.drop(mg2_drop, axis=1, inplace=True)\n", + " mg2 = mg2.reindex(columns=['id_municipio','id_item','id_licitacao','ano','orgao','quantidade_cotada', 'valor_unitario_cotacao'])\n", + " except IOError:\n", + " print(\"Erro de input/output para o município {} e ano {} - mg2\".format(m, a))\n", + "\n", + " try:\n", + " # Items with reference price\n", + " with z.open('{}.{}.licitacao.refLicitacao.csv'.format(a,m)) as f:\n", + " mg3 = pd.read_csv(f,sep=';', encoding='utf-8', dtype=str)\n", + " mg3['id_municipio'] = m\n", + " mg3.rename(columns=mg3_rename, inplace=True)\n", + " mg3.drop(mg3_drop, axis=1, inplace=True)\n", + " mg3 = mg3.reindex(columns=['id_municipio','id_item','id_licitacao','ano','orgao','valor_unitario_cotacao'])\n", + " except IOError:\n", + " print(\"Erro de input/output para o município {} e ano {} - mg3\".format(m, a))\n", + "\n", + " try:\n", + " # Homologated items (suppliers)\n", + " with z.open('{}.{}.licitacao.homologLicitacao.csv'.format(a,m)) as f:\n", + " mg4 = pd.read_csv(f,sep=';', encoding='utf-8', dtype=str)\n", + " mg4['id_municipio'] = m\n", + " mg4.rename(columns=mg4_rename, inplace=True)\n", + " mg4.drop(mg4_drop, axis=1, inplace=True)\n", + " mg4 = mg4.reindex(columns=['id_municipio','id_item','id_licitacao','ano','orgao','quantidade','valor_unitario','nome_vencedor','documento'])\n", + " except IOError:\n", + " print(\"Erro de input/output para o município {} e ano {} - mg4\".format(m, a))\n", + "\n", + " try:\n", + " # Non competitive procurement (dispensa/inexibilidade)\n", + " with z.open('{}.{}.licitacao.itemDispensa.csv'.format(a,m)) as f:\n", + " mg5 = 
pd.read_csv(f,sep=';', encoding='utf-8', dtype=str)\n", + " mg5['id_municipio'] = m\n", + " mg5.rename(columns=mg5_rename, inplace=True)\n", + " mg5.drop(mg5_drop, axis=1, inplace=True)\n", + " mg5 = mg5.reindex(columns=['id_municipio','id_item','id_dispensa','ano','orgao','descricao','unidade_medida','numero'])\n", + " except IOError:\n", + " print(\"Erro de input/output para o município {} e ano {} - mg5\".format(m, a))\n", + "\n", + " try:\n", + " # Quoted items\n", + " with z.open('{}.{}.licitacao.cotDispensa.csv'.format(a,m)) as f:\n", + " mg6 = pd.read_csv(f,sep=';', encoding='utf-8', dtype=str)\n", + " mg6['id_municipio'] = m\n", + " mg6.rename(columns=mg6_rename, inplace=True)\n", + " mg6.drop(mg6_drop, axis=1, inplace=True)\n", + " mg6 = mg6.reindex(columns=['id_municipio','id_item','id_dispensa','ano','orgao','quantidade_cotada', 'valor_unitario_cotacao'])\n", + " except IOError:\n", + " print(\"Erro de input/output para o município {} e ano {} - mg6\".format(m, a))\n", + "\n", + " try:\n", + " # Suppliers\n", + " with z.open('{}.{}.licitacao.fornDispensa.csv'.format(a,m)) as f:\n", + " mg7 = pd.read_csv(f,sep=';', encoding='utf-8', dtype=str)\n", + " mg7['id_municipio'] = m\n", + " mg7.rename(columns=mg7_rename, inplace=True)\n", + " mg7.drop(mg7_drop, axis=1, inplace=True)\n", + " mg7 = mg7.reindex(columns=['id_municipio','id_item','id_dispensa','ano','orgao','quantidade','valor_unitario','nome_vencedor','documento'])\n", + " except IOError:\n", + " print(\"Erro de input/output para o município {} e ano {} - mg7\".format(m, a))\n", + "\n", + "\n", + " # Merge competitive procurement\n", + "\n", + " # First - merge quoted items with items with reference price\n", + " merge1 = pd.merge(mg2, mg3, how='outer', left_on=['id_municipio','id_item','id_licitacao','ano','orgao','valor_unitario_cotacao'],\n", + " right_on=['id_municipio','id_item','id_licitacao','ano','orgao','valor_unitario_cotacao'])\n", + "\n", + " # Second - merge items general information 
with their quoted or reference price\n", + " merge2 = pd.merge(mg1, merge1 ,how='left', left_on=['id_municipio','id_item','id_licitacao','ano','orgao'],\n", + " right_on=['id_municipio','id_item','id_licitacao','ano','orgao'])\n", + "\n", + " # Third - merge with homologated items to get suppliers\n", + " mg_licitacao_1= pd.merge(merge2, mg4 ,how='left', left_on=['id_municipio','id_item','id_licitacao','ano','orgao'],\n", + " right_on=['id_municipio','id_item','id_licitacao','ano','orgao'])\n", + "\n", + " mg_licitacao_1['documento']=mg_licitacao_1['documento'].str.strip()\n", + "\n", + " # Merge to get id_unidade_gestora\n", + " mg_licitacao_1 = pd.merge(mg_licitacao_1, ug_id1, how='left', left_on=['ano','id_municipio','id_licitacao'],\n", + " right_on=['ano','id_municipio','id_licitacao'])\n", + "\n", + " # Assign state acronym to the 'sigla_uf'\n", + " mg_licitacao_1['sigla_uf'] = \"MG\"\n", + "\n", + " # Create a unique identifier for each purchase, as in licitacao table\n", + " mg_licitacao_1['id_licitacao_bd'] = mg_licitacao_1['id_licitacao'] + mg_licitacao_1['id_unidade_gestora'] + mg_licitacao_1['sigla_uf']\n", + "\n", + " # Keep only necessary variables\n", + " mg_licitacao_1 = mg_licitacao_1.reindex(columns=ordem)\n", + "\n", + " # Merge non competitive procurement\n", + "\n", + " # First - merge items general information with their quoted price\n", + " merge3 = pd.merge(mg5, mg6, how='outer', left_on=['id_municipio','id_item','id_dispensa','ano','orgao'],\n", + " right_on=['id_municipio','id_item','id_dispensa','ano','orgao'])\n", + "\n", + " # Second - merge with homologated items to get suppliers\n", + " mg_dispensa_1 = pd.merge(merge3, mg7 ,how='left', left_on=['id_municipio','id_item','id_dispensa','ano','orgao'],\n", + " right_on=['id_municipio','id_item','id_dispensa','ano','orgao'])\n", + "\n", + " mg_dispensa_1['documento']=mg_dispensa_1['documento'].str.strip()\n", + "\n", + " # Merge to get id_unidade_gestora\n", + " mg_dispensa_1 = 
pd.merge(mg_dispensa_1, ug_id2, how='left', left_on=['ano','id_municipio','id_dispensa'],\n", + " right_on=['ano','id_municipio','id_dispensa'])\n", + "\n", + " # Assign state acronym to the 'sigla_uf'\n", + " mg_dispensa_1['sigla_uf'] = \"MG\"\n", + "\n", + " # Create a unique identifier for each purchase, as in licitacao table\n", + " mg_dispensa_1['id_licitacao_bd'] = mg_dispensa_1['id_dispensa'] + mg_dispensa_1['id_unidade_gestora'] + mg_dispensa_1['sigla_uf']\n", + "\n", + " # Keep only necessary variables\n", + " mg_dispensa_1 = mg_dispensa_1.reindex(columns=ordem)\n", + "\n", + " # Append competitive and non competitive\n", + " mg_item_3 = mg_licitacao_1.append([mg_dispensa_1],ignore_index=True)\n", + "\n", + " # Variables format\n", + " floats = ['valor_unitario_cotacao', 'valor_proposta', 'valor_unitario','valor_total',\n", + " 'quantidade','quantidade_cotada']\n", + "\n", + " mg_item_3[floats] = mg_item_3[floats].astype(float)\n", + "\n", + " strings= ['id_licitacao','id_dispensa','id_unidade_gestora']\n", + " mg_item_3[strings] = mg_item_3[strings].astype(str)\n", + "\n", + " mg_item_3['numero_lote'] = mg_item_3['numero_lote'].replace(\"-1\",np.nan)\n", + " mg_item_3['id_licitacao'] = mg_item_3['id_licitacao'].replace(\"nan\",np.nan)\n", + " mg_item_3['id_dispensa'] = mg_item_3['id_dispensa'].replace(\"nan\",np.nan)\n", + "\n", + " # Create total value\n", + "\n", + " mg_item_3['valor_total'] = np.where((mg_item_3['valor_unitario'].notnull()) | (mg_item_3['quantidade'].notnull()),\n", + " mg_item_3['quantidade'] * mg_item_3['valor_unitario'] , np.nan)\n", + "\n", + " # Create a unique identifier for each item\n", + " mg_item_3['id_item_bd'] = mg_item_3['id_item'] + mg_item_3['id_unidade_gestora'] + mg_item_3['sigla_uf']\n", + "\n", + " # Duplicates only allowed for items supplied by different suppliers\n", + "\n", + " mg_item_3['id_item_bd'] = np.where((mg_item_3.duplicated(['id_item_bd'], keep=False)) & (~mg_item_3.duplicated(['id_item_bd', 
'descricao'], keep=False)), np.nan, mg_item_3['id_item_bd'])\n", + "\n", + " mg_item_3['id_item_bd'] = np.where((mg_item_3.duplicated(['id_item_bd','descricao','documento'], keep=False)) & (~mg_item_3.duplicated(['id_item_bd', 'descricao','documento','quantidade'], keep=False)), np.nan, mg_item_3['id_item_bd'])\n", + "\n", + " # Few duplicates in id_item_bd\n", + " mg_item_3.drop_duplicates(inplace=True)\n", + "\n", + " mg_item_3 = mg_item_3.reindex(columns=ordem)\n", + "\n", + " # Partition by year and municipality\n", + " mg_item_3.drop(['ano','sigla_uf'],axis=1,inplace=True)\n", + "\n", + " exec(\"mg_item_3.to_csv('/content/gdrive/MyDrive/ComprasPublicas_Brasil/output/licitacao_item/ano={}/sigla_uf=MG/municipio_{}.csv', index=False, encoding='utf-8', na_rep='', float_format='%.2f')\".format(a,m))\n", + "\n", + " # Append all\n", + " mg_item_3['ano'] = a\n", + " mg_item_3['sigla_uf'] = \"MG\"\n", + "\n", + " all_df_mg.append(mg_item_3)\n", + "\n", + "mg_item_3 = pd.concat(all_df_mg, ignore_index=True, sort=True)\n", + "\n", + "mg_item_3.to_csv(os.path.join(path, \"output/temp/mg_item_3.csv\"), index=False, na_rep='', float_format='%.2f')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "LcSk4bjwQrVW" + }, + "source": [ + "## PR" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "MxkZXwdZFLM8" + }, + "outputs": [], + "source": [ + "#PR\n", + "\n", + "# List municipalities to loop\n", + "# Drop municipalities for which we couldn't transform xml in csv\n", + "\n", + "municipio_pr = municipio.query('sigla_uf==\"PR\"')\n", + "\n", + "municipio_pr = municipio_pr.query('id_municipio_6 != \"411915\" & \\\n", + " id_municipio_6 != \"411370\" & \\\n", + " id_municipio_6 != \"411535\" & \\\n", + " id_municipio_6 != \"411710\" & \\\n", + " id_municipio_6 != \"412627\" & \\\n", + " id_municipio_6 != \"410140\" & \\\n", + " id_municipio_6 != \"410350\"')\n", + "\n", + "municipios_pr = 
municipio_pr['id_municipio_6'].tolist()\n", + "\n", + "# Rename and list variables to drop\n", + "\n", + "pr_columns = ['cdIBGE','idlicitacao','idPessoa','nrAnoLicitacao','dsItem','dsUnidadeMedida','nmPessoa','nrDocumento','nrItem',\n", + " 'nrLote','nrQuantidadePropostaLicitacao','nrQuantidadeVencedorLicitacao','vlLicitacaoVencedorLicitacao',\n", + " 'vlPropostaItem','nrClassificacao']\n", + "\n", + "pr_rename = {'cdIBGE':'id_municipio', 'idlicitacao':'id_licitacao', 'idPessoa':'id_unidade_gestora',\n", + " 'nrAnoLicitacao':'ano', 'dsItem':'descricao', 'dsUnidadeMedida':'unidade_medida',\n", + " 'nmPessoa':'nome_vencedor','nrDocumento':'documento','nrItem':'numero','nrLote':'numero_lote',\n", + " 'nrQuantidadePropostaLicitacao':'quantidade_proposta','nrQuantidadeVencedorLicitacao':'quantidade',\n", + " 'vlLicitacaoVencedorLicitacao':'valor_vencedor', 'vlPropostaItem':'valor_proposta',\n", + " 'nrClassificacao':'numero_classificacao'}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "xjPI5FUIQueQ" + }, + "outputs": [], + "source": [ + "anos_pr = ['2013','2014','2015','2016','2017','2018','2019','2020','2021']\n", + "\n", + "all_df_pr = []\n", + "\n", + "for a in anos_pr:\n", + " for m in municipios_pr:\n", + "\n", + " exec(\"path_lic_venc = '/content/gdrive/MyDrive/ComprasPublicas_Brasil/input/PR/{}/Licitacao/{}/{}_{}_LicitacaoVencedor.csv'\".format(a, m, a, m))\n", + "\n", + " pr = pd.read_csv(path_lic_venc, sep=',', encoding='utf-8', dtype=str) #usecols = pr_columns\n", + " pr.rename(pr_rename, axis=1, inplace=True)\n", + "\n", + " # Merge id_municipio 6 and 7 digits id\n", + " pr = pd.merge(pr, municipio, how='left', left_on='id_municipio', right_on='id_municipio_6')\n", + "\n", + " pr.drop(['id_municipio_x','id_municipio_6','nome', 'id_municipio_tce'],axis=1, inplace=True)\n", + " pr.rename({'id_municipio_y':'id_municipio'},axis=1, inplace=True)\n", + "\n", + " # Format\n", + " pr['documento'] = 
pr['documento'].str.replace(\"-\",\"\")\n", + " pr['documento'] = pr['documento'].str.replace(\".\",\"\")\n", + " pr['documento']= pr['documento'].str.strip()\n", + "\n", + " # Create a unique identifier for each purchase\n", + " pr['id_licitacao_bd'] = pr['id_licitacao'] + pr['id_unidade_gestora'] + pr['sigla_uf']\n", + "\n", + " # Create a unique identifier for each item\n", + " pr['id_item'] = pr['numero'] + ' ' + pr['numero_lote'] + ' ' + pr['id_licitacao'] + ' ' + pr['id_municipio'].str[4:]\n", + "\n", + " # Drop non suppliers\n", + " pr['min_classificacao'] = pr.groupby(['ano','id_municipio','id_licitacao','numero_lote','numero'])['numero_classificacao'].transform('min')\n", + " pr['diff'] = np.where(pr['numero_classificacao'] == pr['min_classificacao'], 0, 1)\n", + " pr = pr[pr['diff']==0]\n", + "\n", + " # Create a unique identifier for each item across states\n", + " pr['id_item_bd'] = pr['id_item'] + pr['sigla_uf']\n", + "\n", + " # Duplicates only allowed for items supplied by different suppliers\n", + "\n", + " pr['id_item_bd'] = np.where((pr.duplicated(['id_item_bd'], keep=False)) & (~pr.duplicated(['id_item_bd', 'descricao'], keep=False)), np.nan, pr['id_item_bd'])\n", + "\n", + " # Drop duplicates in all variables\n", + " pr.drop_duplicates(inplace=True)\n", + "\n", + " # Add zeros to the left, missing in some cnpjs\n", + " pr['length'] = pr['documento'].str.len()\n", + " pr['documento'] = np.where(pr['length'] > 11, pr['documento'].astype(str).str.zfill(14), pr['documento'])\n", + "\n", + " # Variables format\n", + " floats = ['valor_proposta', 'valor_vencedor']\n", + " pr[floats] = pr[floats].astype(float)\n", + "\n", + " pr['quantidade'] = pd.to_numeric(pr['quantidade'] , errors='coerce').fillna(-1).astype(int)\n", + " pr['quantidade'] = pr['quantidade'].replace(-1,'')\n", + "\n", + " pr['quantidade_proposta'] = pd.to_numeric(pr['quantidade_proposta'] , errors='coerce').fillna(-1).astype(int)\n", + " pr['quantidade_proposta'] =
pr['quantidade_proposta'].replace(-1,'')\n", + "\n", + " # Reorder columns\n", + " pr = pr.reindex(columns=ordem)\n", + "\n", + " # Partition by year and municipality\n", + " pr.drop(['ano','sigla_uf'],axis=1,inplace=True)\n", + "\n", + " exec(\"pr.to_csv('/content/gdrive/MyDrive/ComprasPublicas_Brasil/output/licitacao_item/ano={}/sigla_uf=PR/municipio_{}.csv', index=False, encoding='utf-8', na_rep='', float_format='%.2f')\".format(a,m))\n", + "\n", + " # Append all\n", + " pr['ano'] = a\n", + " pr['sigla_uf'] = \"PR\"\n", + "\n", + " pr = pr.reindex(columns=ordem)\n", + "\n", + " all_df_pr.append(pr)\n", + "\n", + "item_pr = pd.concat(all_df_pr, ignore_index=True, sort=True)\n", + "\n", + "item_pr.to_csv(os.path.join(path, \"output/licitacao_item_pr.csv\"), index=False, na_rep='', float_format='%.2f')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "cUrX1vXRYFeX" + }, + "source": [ + "## RS" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "T1ZazlULYLfg" + }, + "outputs": [], + "source": [ + "# List municipalities to loop\n", + "\n", + "municipio_rs = municipio.query('sigla_uf==\"RS\"')\n", + "municipios_rs = municipio_rs['id_municipio'].tolist()\n", + "\n", + "# Rename and list variables to drop\n", + "\n", + "rs_rename = {'ANO_LICITACAO':'ano','CD_MUNICIPIO_IBGE':'id_municipio','DS_ITEM':'descricao','NR_ITEM':'numero',\n", + " 'NR_LOTE':'numero_lote','QT_ITENS':'quantidade_cotada','SG_UNIDADE_MEDIDA':'unidade_medida',\n", + " 'VL_UNITARIO_ESTIMADO':'valor_unitario_cotacao','VL_UNITARIO_HOMOLOGADO':'valor_unitario',\n", + " 'VL_TOTAL_HOMOLOGADO':'valor_total','NR_DOCUMENTO':'documento','NR_LICITACAO':'id_licitacao',\n", + " 'TP_DOCUMENTO.1':'TP_DOCUMENTO_2','CD_TIPO_MODALIDADE':'modalidade'}\n", + "\n", + "rs_drop = ['BL_COVID19','CD_FONTE_REFERENCIA','CD_TIPO_FAMILIA','CD_TIPO_SUBFAMILIA','DS_FONTE_REFERENCIA','DT_REF_VALOR_ESTIMADO',\n", + " 
'PC_ENCARGOS_SOCIAIS_ESTIMADO','PC_ENCARGOS_SOCIAIS_HOMOLOGADO','PC_TX_ESTIMADA','TP_ORCAMENTO',\n", + " 'PC_TX_HOMOLOGADA','TP_BENEFICIO_MICRO_EPP','PC_BDI_ESTIMADO',\t'PC_BDI_HOMOLOGADO','NR_ITEM_ORIGINAL',\n", + " 'TP_RESULTADO_ITEM','NR_DOCUMENTO.1','TP_DOCUMENTO','TP_DOCUMENTO_2']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "aiFDwFk4sUoh" + }, + "outputs": [], + "source": [ + "#item.csv\n", + "#licitacao.csv\n", + "#pessoas.csv\n", + "\n", + "# RS folder\n", + "folder = os.path.join(path,\"input/RS\")\n", + "\n", + "anos_rs = ['2016','2017','2018','2019','2020','2021']\n", + "\n", + "all_df_rs=[]\n", + "for a in anos_rs:\n", + " df = os.path.join(folder, 'Licitacao/{}.csv.zip'.format(a,a))\n", + " with ZipFile(df) as z:\n", + " with z.open('item.csv') as f:\n", + " rs = pd.read_csv(f,sep=',', encoding='utf-8', dtype=str)\n", + "\n", + " with z.open('licitacao.csv') as f:\n", + " rs2 = pd.read_csv(f,sep=',', encoding='utf-8', dtype=str,\n", + " usecols = ['CD_ORGAO','NR_LICITACAO','ANO_LICITACAO','CD_TIPO_MODALIDADE','TP_DOCUMENTO_FORNECEDOR','NR_DOCUMENTO_FORNECEDOR'])\n", + "\n", + " # Merge to get some cnpjs\n", + " rs = pd.merge(rs, rs2, how='left',left_on=['CD_ORGAO','NR_LICITACAO','ANO_LICITACAO','CD_TIPO_MODALIDADE'],\n", + " right_on=['CD_ORGAO','NR_LICITACAO','ANO_LICITACAO','CD_TIPO_MODALIDADE'])\n", + "\n", + " # Concat document variables into one variable\n", + " rs['NR_DOCUMENTO'] = np.where(rs['NR_DOCUMENTO'].isna(), rs['NR_DOCUMENTO.1'], rs['NR_DOCUMENTO'])\n", + " rs['NR_DOCUMENTO'] = np.where(rs['NR_DOCUMENTO'].isna(), rs['NR_DOCUMENTO_FORNECEDOR'], rs['NR_DOCUMENTO'])\n", + "\n", + " rs.rename(rs_rename, axis=1, inplace=True)\n", + "\n", + " # Set as missing documents with strings (razao_social)\n", + " rs['documento'] = np.where(rs['documento'].str.isnumeric(), rs['documento'], np.nan)\n", + "\n", + " # Add zeros to the left, missing in some cnpjs\n", + " rs['length'] = rs['documento'].str.len()\n", 
+ " rs['documento'] = np.where((rs['length'] > 11) & ((rs['TP_DOCUMENTO']==\"J\") | (rs['TP_DOCUMENTO_2']==\"J\") | (rs['TP_DOCUMENTO_FORNECEDOR']==\"J\")),\n", + " rs['documento'].str.zfill(14), rs['documento'])\n", + "\n", + " rs.drop(rs_drop, axis=1, inplace=True)\n", + "\n", + " # Assign state acronym to the 'sigla_uf'\n", + " rs['sigla_uf'] = \"RS\"\n", + "\n", + " # Merge to get id_municipio\n", + "\n", + " rs = pd.merge(rs, orgao_municipio, how='left',left_on='CD_ORGAO', right_on='CD_ORGAO', indicator=True) # some ids missing\n", + "\n", + " rs.rename({'CD_MUNICIPIO_IBGE':'id_municipio','CD_ORGAO':'orgao'},axis=1, inplace=True)\n", + "\n", + " rs = rs[rs['_merge']==\"both\"]\n", + " rs.drop('_merge', axis=1, inplace=True)\n", + "\n", + " # Create a unique identifier for each purchase, as in licitacao table\n", + " rs['id_licitacao_bd'] = rs['id_licitacao'] + rs['ano'] + rs['modalidade'] + rs['orgao'] + rs['sigla_uf']\n", + "\n", + " rs = rs.drop(rs[(rs['modalidade'] == \"MAI\")].index)\n", + "\n", + " # Create a unique identifier for each item\n", + " rs['id_item'] = rs['numero'] + ' ' + rs['numero_lote'] + ' ' + rs['id_licitacao'] + ' ' + rs['ano'] + ' ' + rs['modalidade'] + ' ' + rs['orgao'] #few duplicates yet\n", + "\n", + " # Create a unique identifier for each item across states\n", + " rs['id_item_bd'] = rs['id_item'] + rs['sigla_uf']\n", + "\n", + " # Open file with participants information\n", + "\n", + " with z.open('pessoas.csv') as f:\n", + " rs3 = pd.read_csv(f,sep=',', encoding='utf-8', dtype=str, usecols = ['NR_DOCUMENTO','NM_PESSOA','TP_PESSOA'])\n", + "\n", + " rs3.rename({'NM_PESSOA':'nome_vencedor','NR_DOCUMENTO':'documento','TP_PESSOA':'tipo'},axis=1,inplace=True)\n", + "\n", + " # Type of supplier - firm, person or international\n", + " rs3['tipo'] = rs3['tipo'].replace(['J','F','E','P'],['1','2','3',''])\n", + "\n", + " # Replace documents containing strings with missing (usually suppliers name)\n", + " rs3['documento'] =
np.where(rs3['documento'].str.isnumeric(), rs3['documento'], np.nan)\n", + "\n", + " # Add zeros to the left, missing in some cnpjs\n", + " rs3['length'] = rs3['documento'].str.len()\n", + " rs3['documento'] = np.where((rs3['length'] > 11) & (rs3['tipo']==\"1\"), rs3['documento'].str.zfill(14), rs3['documento'])\n", + "\n", + " rs3.drop(['tipo','length'],axis=1,inplace=True)\n", + "\n", + " rs3.drop_duplicates(subset=['documento'],inplace=True) #same document in many rows (same suppliers)\n", + "\n", + " # Merge to get suppliers name\n", + " rs = pd.merge(rs, rs3, how='left', left_on=['documento'], right_on=['documento'])\n", + "\n", + " # Format\n", + "\n", + " rs['valor_unitario']=rs['valor_unitario'].astype(float)\n", + " rs['valor_total'] = rs['valor_total'].replace('###############', np.nan)\n", + " rs['valor_total']=rs['valor_total'].astype(float)\n", + "\n", + " rs['quantidade'] = rs['valor_total']/rs['valor_unitario']\n", + "\n", + " rs['quantidade'] = rs['quantidade'].replace(np.nan, -1)\n", + " rs['quantidade'] = rs['quantidade'].astype(int)\n", + " rs['quantidade'] = rs['quantidade'].replace(-1, '')\n", + "\n", + " rs['quantidade_cotada'] = rs['quantidade_cotada'].astype(float)\n", + " rs['quantidade_cotada'] = rs['quantidade_cotada'].astype(int)\n", + "\n", + " # Drop duplicates in all variables\n", + " rs.drop_duplicates(inplace=True)\n", + "\n", + " # Duplicates not allowed for same id_item_bd, but different descriptions\n", + " rs['id_item_bd'] = np.where((rs.duplicated(['id_item_bd'], keep=False)) & (~rs.duplicated(['id_item_bd', 'descricao'], keep=False)), np.nan, rs['id_item_bd'])\n", + "\n", + " # Reorder columns\n", + " rs = rs.reindex(columns=ordem)\n", + "\n", + " # Append all\n", + " all_df_rs.append(rs)\n", + "\n", + "rs = pd.concat(all_df_rs, ignore_index=True, sort=True)\n", + "\n", + "# Save\n", + "rs.to_csv(os.path.join(path, \"output/licitacao_item_rs.csv\"), index=False, na_rep='', float_format='%.2f')" + ] + }, + { + "cell_type": 
"markdown", + "metadata": { + "id": "Yn8yojD2jVh9" + }, + "source": [ + "## Partition" + ] + }, + { + "cell_type": "code", + "source": [ + "# List of UFs\n", + "ufs = ['CE', 'RS']\n", + "\n", + "# Loop over each UF\n", + "for uf in ufs:\n", + " # Load the corresponding CSV file for the UF\n", + " file_path = f'/content/gdrive/MyDrive/ComprasPublicas_Brasil/output/licitacao_item_{uf.lower()}.csv'\n", + " df = pd.read_csv(file_path, dtype=str, encoding='utf-8')\n", + "\n", + " # Convert 'ano' column to integer\n", + " df['ano'] = df['ano'].astype(int)\n", + "\n", + " # Save cvs by year and state\n", + "\n", + " for ano in [*range(2012, 2022)]:\n", + " for uf in ufs:\n", + " if uf == 'CE' and ano in [*range(2009, 2022)]:\n", + " print(\"Particionando {} do CE\".format(ano))\n", + " df2 = df[df['ano'] == ano]\n", + " df2.drop(['ano', 'sigla_uf'], axis=1, inplace=True)\n", + " exec(\"df2.to_csv('/content/gdrive/MyDrive/ComprasPublicas_Brasil/output/licitacao_item/ano={}/sigla_uf=CE/microdados.csv', index=False, encoding='utf-8', na_rep='', float_format='%.2f')\".format(ano))\n", + " if uf == 'RS' and ano in [*range(2016, 2022)]:\n", + " print(\"Particionando {} do RS\".format(ano))\n", + " df2 = df[df['ano'] == ano]\n", + " df2.drop(['ano', 'sigla_uf'], axis=1, inplace=True)\n", + " exec(\"df2.to_csv('/content/gdrive/MyDrive/ComprasPublicas_Brasil/output/licitacao/ano={}/sigla_uf=RS/microdados.csv', index=False, encoding='utf-8', na_rep='', float_format='%.2f')\".format(ano))" + ], + "metadata": { + "id": "HHafZnjD8OT-" + }, + "execution_count": null, + "outputs": [] + } + ], + "metadata": { + "colab": { + "collapsed_sections": [ + "hbqe1K4Nt4Hr", + "B5DfSEqUrWqH", + "cUrX1vXRYFeX" + ], + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file diff --git a/bases/world_wb_mides/code/licitacao_participante.ipynb 
b/bases/world_wb_mides/code/licitacao_participante.ipynb new file mode 100644 index 000000000..78b6abb50 --- /dev/null +++ b/bases/world_wb_mides/code/licitacao_participante.ipynb @@ -0,0 +1,1012 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "source": [ + "**Preamble**\n", + "* This code cleans the raw public procurement data obtained from the State Audit Courts (TCEs) of the following states: CE, PE, MG, PR, RS and PB.\n", + "* The final output of this code is the tender participant table (*licitacao participante*), available at [basedosdados](https://basedosdados.org/dataset/d3874769-bcbd-4ece-a38a-157ba1021514?table=14c5d05b-9830-4710-b7ac-7e0ca1bf9d8b).\n", + "* Made by: Nathalia Sales" + ], + "metadata": { + "id": "J85m26Dmr99k" + } + }, + { + "cell_type": "code", + "source": [ + "# Connect to google drive\n", + "\n", + "from google.colab import drive\n", + "drive.mount('/content/gdrive')\n", + "\n", + "# Necessary packages\n", + "\n", + "import pandas as pd\n", + "import numpy as np\n", + "import os\n", + "import glob\n", + "from zipfile import ZipFile\n", + "from datetime import datetime\n", + "\n", + "# Display options\n", + "\n", + "pd.set_option('display.max_columns', None)\n", + "pd.options.display.float_format = '{:.2f}'.format\n", + "\n", + "# Set directory\n", + "\n", + "path = '/content/gdrive/MyDrive/ComprasPublicas_Brasil'\n", + "\n", + "# Open some auxiliary files\n", + "\n", + "municipio = pd.read_csv(os.path.join(path, \"auxiliary_files/municipio.csv\"), encoding='utf-8', dtype=str)\n", + "\n", + "id_tce = pd.read_csv(os.path.join(path, \"input/PE/municipios.csv\"), encoding='latin-1',dtype=str,\n", + " usecols = ['CODIGOIBGE','CODIGO','UNIDADEFEDERATIVA'])\n", + "\n", + "id_tce.rename(columns={'CODIGOIBGE':'id_municipio', 
'CODIGO':'id_municipio_tce', 'UNIDADEFEDERATIVA':'sigla_uf'}, inplace=True)\n", + "\n", + "# Merge both\n", + "municipio = pd.merge(municipio, id_tce, how='left', left_on=['id_municipio', 'sigla_uf'], right_on=['id_municipio', 'sigla_uf'])\n", + "\n", + "ug_id = pd.read_csv(os.path.join(path, \"auxiliary_files/ug_id_mg.csv\"), sep=',', dtype=str) # MG\n", + "\n", + "orgao_municipio = pd.read_csv(os.path.join(path, \"input/RS/orgaos_auditados_rs.csv\"), encoding='utf-8',dtype=str,\n", + " usecols=['CD_MUNICIPIO_IBGE', 'CD_ORGAO']) # RS\n", + "\n", + "# Create a list of UFs\n", + "ufs = municipio['sigla_uf'].unique().tolist()\n", + "\n", + "# Set columns order\n", + "\n", + "ordem = ['ano','sigla_uf', 'id_municipio', 'orgao','id_unidade_gestora', 'id_licitacao_bd','id_licitacao','id_dispensa',\n", + " 'razao_social','documento', 'habilitado', 'classificado','vencedor', 'endereco', 'cep', 'municipio_participante', 'tipo']" + ], + "metadata": { + "id": "zQzslM-4eW1K" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "# CE" + ], + "metadata": { + "id": "tZ9QQZCMz7e9" + } + }, + { + "cell_type": "code", + "source": [ + "#CE\n", + "\n", + "# Get a list of all CSV files\n", + "\n", + "all_files_ce_licitantes = glob.glob(os.path.join(path,\"input/CE/Licitações/licitantes_*.csv\"))\n", + "\n", + "# Initialize an empty list and loop through each CSV file\n", + "\n", + "all_df_ce = []\n", + "for f in all_files_ce_licitantes:\n", + " df1 = pd.read_csv(f, sep=';', encoding='latin-1', dtype=str)\n", + " df1['arquivo'] = f.split('/')[-1]\n", + " all_df_ce.append(df1)\n", + "\n", + "# Concatenate all DataFrames in the list into a single DataFrame\n", + "\n", + "ce1 = pd.concat(all_df_ce, ignore_index=True, sort=True)\n", + "\n", + "# Extract the year from the file name\n", + "ce1['ano'] = ce1['arquivo'].str[11:15]\n", + "\n", + "# Drop and rename\n", + "ce1_drop = ['fone_negociante','arquivo','codigo_uf ']\n", + "\n", + "ce1_rename 
= {'numero_licitacao':'id_licitacao', 'codigo_tipo_negociante':'tipo',\n", + " 'nome_negociante':'razao_social', 'endereco_negociante':'endereco', 'cep_negociante':'cep',\n", + " 'nome_municipio_negociante':'municipio_participante', 'numero_documento_negociante':'documento'}\n", + "\n", + "ce1.drop(ce1_drop, axis=1, inplace=True)\n", + "ce1.rename(ce1_rename, axis=1, inplace=True)\n", + "\n", + "# Read a CSV file containing municipality information\n", + "\n", + "id_mun = pd.read_csv(os.path.join(path, \"input/CE/municipios.csv\"), sep=';', dtype=str, encoding='latin-1',\n", + " usecols=['geoibgeId','codigo_municipio'])\n", + "\n", + "id_mun.rename({'geoibgeId':'id_municipio'}, axis=1, inplace=True)\n", + "\n", + "# Merge on codigo_municipio to get id_municipio (IBGE code)\n", + "\n", + "ce1 = pd.merge(ce1, id_mun, how='left', left_on='codigo_municipio', right_on='codigo_municipio')\n", + "\n", + "# Assign state acronym to the 'sigla_uf'\n", + "ce1['sigla_uf']='CE'\n", + "\n", + "# Create a unique identifier for each purchase\n", + "ce1['id_licitacao_bd'] = ce1['id_licitacao'] + ce1['id_municipio'] + ce1['ano'].str[2:4] + ce1['sigla_uf']\n", + "\n", + "# Merge with licitacao to deal with duplicates in id_licitacao_bd\n", + "\n", + "ce2 = pd.read_csv(os.path.join(path, \"output/licitacao_ce.csv\"), dtype=str, encoding='utf-8',\n", + " usecols=['id_municipio','ano','id_licitacao','id_licitacao_bd'])\n", + "\n", + "ce2.rename({'id_licitacao_bd':'id_licitacao_bd_2'}, axis=1, inplace=True)\n", + "\n", + "ce1 = pd.merge(ce1, ce2, how='left', left_on=['id_municipio','ano','id_licitacao'],\n", + " right_on=['id_municipio','ano','id_licitacao'], indicator=True)\n", + "\n", + "# If id_licitacao (which comes from the original variable numero_licitacao), ano and id_municipio are the same\n", + "# in both licitacao and participants table, but id_licitacao_bd is different, it must be the case that we cannot\n", + "# uniquely identify that tender. 
In those cases, we set the id as missing.\n", + "\n", + "ce1['id_licitacao_bd'] = np.where((ce1['_merge']==\"both\") & (ce1['id_licitacao_bd'] != ce1['id_licitacao_bd_2']), np.nan, ce1['id_licitacao_bd'])\n", + "\n", + "# Drop non-necessary variable\n", + "ce1.drop('data_realizacao_licitacao', axis=1, inplace=True)\n", + "\n", + "# Apply standard format to documents/cnpjs\n", + "ce1['length'] = ce1['documento'].str.len()\n", + "ce1['documento'] = np.where(ce1['length'] == 13, ce1['documento'].str.zfill(14), ce1['documento'])\n", + "ce1['documento'] = np.where(ce1['length'] == 15, ce1['documento'].str[1:], ce1['documento'])\n", + "\n", + "# Merge with items to get winners\n", + "\n", + "ce3 = pd.read_csv(os.path.join(path, \"output/licitacao_item_ce.csv\"),encoding='utf-8', dtype=str,\n", + " usecols=['id_municipio','ano','id_licitacao_bd','documento'])\n", + "\n", + "# One supplier may appear many times in the same tender by winning different items\n", + "# Keep it unique\n", + "\n", + "ce3.drop_duplicates(subset=['documento','id_licitacao_bd'], inplace=True)\n", + "\n", + "ce3['vencedor']=\"1\"\n", + "\n", + "# Merge suppliers and items\n", + "ce = pd.merge(ce1, ce3, how='left', left_on=['id_municipio','ano','id_licitacao_bd','documento'],\n", + " right_on=['id_municipio','ano','id_licitacao_bd','documento'])\n", + "\n", + "ce['vencedor'] =ce['vencedor'].replace(np.nan,\"0\")\n", + "\n", + "# Drop duplicates in all variables\n", + "ce.drop_duplicates(inplace=True)\n", + "\n", + "# Six cases where there are small differences in address, keep the last\n", + "ce.drop_duplicates(subset=['documento','id_licitacao_bd'], inplace=True, keep='last')\n", + "\n", + "# Reorder columns\n", + "ce = ce.reindex(columns=ordem)\n", + "\n", + "# Save\n", + "ce.to_csv(os.path.join(path,\"output/licitacao_participante_ce.csv\"), index=False, na_rep='', float_format='%.2f')" + ], + "metadata": { + "id": "I_ZExQ_qz-aI" + }, + "execution_count": null, + "outputs": [] + }, + { + 
"cell_type": "markdown", + "source": [ + "# PE" + ], + "metadata": { + "id": "hbqe1K4Nt4Hr" + } + }, + { + "cell_type": "code", + "source": [ + "#PE\n", + "\n", + "# Get a list of all CSV files\n", + "\n", + "all_files = glob.glob(os.path.join(path,\"input/PE/Licitações/licitacoesdetalhes_*.csv\"))\n", + "\n", + "# Initialize an empty list and loop through each CSV file\n", + "\n", + "all_df = []\n", + "for f in all_files:\n", + " df1 = pd.read_csv(f, sep=',', encoding='latin-1', dtype=str)\n", + " df1['arquivo'] = f.split('/')[-1]\n", + " all_df.append(df1)\n", + "\n", + "# Concatenate all DataFrames in the list into a single DataFrame\n", + "\n", + "pe = pd.concat(all_df, ignore_index=True, sort=True)\n", + "\n", + "# Extract the year from the file name\n", + "\n", + "pe['ano'] = pe['arquivo'].str[19:23]\n", + "\n", + "# List original variables to drop\n", + "\n", + "columns_pe = ['DESCRICAOOBJETO', 'DATAPUBLICACAOHOMOLOGACAO', 'LinkArquivo', 'CODIGOOBJETO', 'VALORORCAMENTOESTIMATIVO',\n", + " 'CODIGOSITUACAOLICITACAO', 'UG', 'ESTAGIOLICITACAO', 'NOMEMODALIDADE', 'OBJETOCONFORMEEDITAL', 'CODIGODESCRICAOOBJETO',\n", + " 'QTDELICITANTES', 'NOMENATUREZA', 'TOTALADJUDICADOLICITANTE', 'NUMEROMODALIDADE', 'CODIGOESTAGIOLICITACAO',\n", + " 'TOTALADJUDICADOLICITACAO', 'NUMEROPROCESSO', 'CARACTERISTICAOBJETO', 'CODIGONATUREZA', 'SITUACAOLICITACAO',\n", + " 'DATAPUBLICACAOHABILITACAO', 'ANOMODALIDADE', 'ESPECIFICACAOOBJETO', 'DATAEMISSAOEDITAL','ANOPROCESSO','arquivo',\n", + " 'DATASESSAOABERTURA','DOTACAOORCAMENTARIA','FUNDAMENTOLEGAL','RESULTADOHABILITACAO','ADJUDICADA','CODIGOMUNICIPIO']\n", + "\n", + "# Dictionary\n", + "\n", + "pe_rename = {'CODIGOUG':'id_unidade_gestora', 'NUMERODOCUMENTOAJUSTADO':'documento', 'CODIGOPL':'id_licitacao',\n", + " 'RAZAOSOCIAL':'razao_social'}\n", + "\n", + "status = {'Vencedor':'1', 'Não Vencedor':'0'}\n", + "habilitacao = {'Habilitado':'1', 'Inabilitado':'0', 'Não analisado':'0', 'Dispensado':'0'}\n", + "\n", + "# Drop and rename\n", 
+ "\n", + "pe.rename(pe_rename, axis=1, inplace=True)\n", + "\n", + "pe['vencedor'] = pe['ADJUDICADA'].map(status)\n", + "pe['habilitado'] = pe['RESULTADOHABILITACAO'].map(habilitacao)\n", + "\n", + "pe.drop(columns_pe, axis=1, inplace=True)\n", + "\n", + "# Read a CSV file containing municipality information\n", + "\n", + "ug = pd.read_csv(os.path.join(path, \"input/PE/unidadesjurisdicionadas.csv\"), sep=',', encoding='latin-1',dtype=str)\n", + "\n", + "ug_drop = ['CODIGOTCE', 'ESFERA', 'PODER','UNIDADEFEDERATIVA', 'NATUREZA', 'TIPOPESSOAJURIDICA', 'ORGAO', 'MUNICIPIO', 'SIGLA', 'SITUACAO', 'CNPJ']\n", + "ug_rename = {'CODIGOMUNICIPIO':'id_municipio_tce','ID_UNIDADE_GESTORA': 'id_unidade_gestora'}\n", + "\n", + "ug.drop(ug_drop, axis=1, inplace=True)\n", + "ug.rename(columns = ug_rename, inplace=True)\n", + "\n", + "# Merge on id_unidade_gestora to get id_municipio_tce (TCE code)\n", + "\n", + "pe = pd.merge(pe, ug, how='left',left_on=['id_unidade_gestora'], right_on=['id_unidade_gestora'], indicator=True)\n", + "\n", + "pe.drop('_merge',axis=1, inplace=True)\n", + "\n", + "# Merge on id_municipio_tce to get id_municipio (IBGE code)\n", + "\n", + "pe = pd.merge(pe, municipio, how='left', left_on='id_municipio_tce', right_on='id_municipio_tce')\n", + "\n", + "pe.drop(['nome','id_municipio_6','id_municipio_tce'],axis=1, inplace=True)\n", + "\n", + "# Many documents with zeros: 12718 observations (just for non-winners)\n", + "# Apparently deserted bids\n", + "\n", + "# Create type variable based on document\n", + "# 1 - CNPJ\n", + "# 2 - CPF\n", + "\n", + "pe['documento'] = pe['documento'].str.replace(\"-\",\"\")\n", + "pe['documento'] = pe['documento'].str.replace(\".\",\"\")\n", + "pe['documento']=pe['documento'].str.strip()\n", + "\n", + "pe['length'] = pe['documento'].str.len()\n", + "\n", + "conditions = [\n", + " (pe['length'] == 14),\n", + " (pe['length'] == 11),\n", + " (pe['length'] != 11) & (pe['length'] != 14)\n", + "]\n", + "categories = ['1', '2', 
'']\n", + "pe['tipo'] = np.select(conditions, categories)\n", + "\n", + "pe['tipo'] = pe['tipo'].replace('',np.nan)\n", + "\n", + "# Create a unique identifier for each purchase\n", + "\n", + "pe['id_licitacao_bd'] = pe['id_licitacao'] + pe['id_unidade_gestora'] + pe['sigla_uf']\n", + "\n", + "# Reorder columns\n", + "\n", + "pe = pe.reindex(columns=ordem)\n", + "\n", + "# Save\n", + "\n", + "pe.to_csv(os.path.join(path, \"output/licitacao_participante_pe.csv\"), index=False, na_rep='', float_format='%.2f')" + ], + "metadata": { + "id": "Gg5yvAkiuUUT" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "# MG" + ], + "metadata": { + "id": "B5DfSEqUrWqH" + } + }, + { + "cell_type": "code", + "source": [ + "# List municipalities to loop\n", + "\n", + "municipio_mg = municipio.query('sigla_uf==\"MG\"')\n", + "municipios_mg = municipio_mg['id_municipio'].tolist()\n", + "\n", + "# List years\n", + "anos_mg = ['2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021']\n", + "\n", + "# MG folder\n", + "pasta = os.path.join(path, \"input/MG\")\n", + "\n", + "# Rename and drop variables\n", + "\n", + "mg_rename = {'seq_licitacao':'id_licitacao', 'seq_orgao':'orgao',\n", + " 'num_ano_referencia':'ano', 'num_documento':'documento', 'nom_pessoa':'razao_social'}\n", + "\n", + "mg_drop = ['seq_hab_licitacao', 'num_mes_referencia', 'dsc_objeto_social', 'dsc_tipo_orgao_resp',\n", + " 'dat_registro', 'num_registro', 'dat_registro_cvm', 'num_registro_cvm', 'num_inscr_estadual',\n", + " 'dsc_estado_inscr', 'num_certidao_inss', 'dat_emi_cert_inss', 'dat_val_cert_inss',\n", + " 'num_certidao_fgts', 'dat_emi_cert_fgts', 'dat_val_cert_fgts', 'num_cndt', 'dat_emi_cndt',\n", + " 'dat_val_cndt', 'dat_habilitacao', 'dsc_ind_presenca_licit', 'dsc_ind_renuncia', 'num_versao_arq']\n", + "\n", + "# Initialize an empty list and loop through each CSV file\n", + "# Qualified participants (Habilitados)\n", + "\n", + "all_df_mg = []\n", + "for a in 
anos_mg:\n", + " for m in municipios_mg:\n", + " path = os.path.join(pasta, '{}/licitacao_{}.zip'.format(a,a))\n", + " if a in ['2014', '2015', '2016']:\n", + " arquivo = f'licitacao/{m}/{a}.{m}.licitacao.habLicitacao.csv'\n", + " elif a == '2017':\n", + " arquivo = f'{a}/licitacao/{m}/{a}.{m}.licitacao.habLicitacao.csv'\n", + " else:\n", + " arquivo = f'{a}.{m}.licitacao.habLicitacao.csv'\n", + "\n", + " with ZipFile(path) as z:\n", + " try:\n", + " with z.open(arquivo) as f:\n", + " df = pd.read_csv(f, sep=';', encoding='utf-8', dtype=str)\n", + " df['id_municipio'] = m\n", + " df.rename(mg_rename, axis=1, inplace=True)\n", + " df.drop(mg_drop, axis=1, inplace=True)\n", + " df['habilitado'] = \"1\"\n", + " df['sigla_uf'] = \"MG\"\n", + " all_df_mg.append(df)\n", + " except IOError:\n", + " print(\"Erro de input/output para o município {} e ano {}\".format(m, a))\n", + "\n", + "mg_habilitados = pd.concat(all_df_mg, ignore_index=True, sort=True)\n", + "\n", + "# Drop duplicates (0.02% only change in seq_hab_licitacao)\n", + "\n", + "mg_habilitados.drop_duplicates(subset=['documento','razao_social','id_licitacao','id_municipio','ano','orgao'],inplace=True)\n", + "\n", + "# Merge to get id_unidade_gestora\n", + "# 120 on left_only\n", + "\n", + "mg_habilitados = pd.merge(mg_habilitados, ug_id, how='left',\n", + " left_on=['ano','id_municipio','id_licitacao'],\n", + " right_on=['ano','id_municipio','id_licitacao'])\n", + "\n", + "# Create a unique identifier for each purchase\n", + "\n", + "mg_habilitados['id_licitacao_bd'] = mg_habilitados['id_licitacao'] + mg_habilitados['id_unidade_gestora'] + mg_habilitados['sigla_uf']\n", + "\n", + "# We still need to get suppliers\n", + "\n", + "# Open items files and select useful variables\n", + "chunksize = 10000\n", + "csv_files = [os.path.join(path,\"output/temp/mg_item_1.csv\"),\n", + " os.path.join(path,\"output/temp/mg_item_2.csv\"),\n", + " os.path.join(path,\"output/temp/mg_item_3.csv\")]\n", + "\n", + "all_df = []\n", 
+ "\n", + "for file in csv_files:\n", + " df_chunk = pd.read_csv(file, dtype=str, encoding='utf-8',\n", + " usecols=['ano','id_municipio','orgao','id_unidade_gestora',\n", + " 'id_licitacao_bd','id_licitacao','id_dispensa',\n", + " 'nome_vencedor','documento'], chunksize=chunksize)\n", + " chunk_list = []\n", + " for chunk in df_chunk:\n", + " chunk_list.append(chunk)\n", + " df = pd.concat(chunk_list)\n", + " all_df.append(df)\n", + "\n", + "licitacao_item_mg = pd.concat(all_df, ignore_index=True, sort=True)\n", + "\n", + "# Keep unique winners\n", + "\n", + "licitacao_item_mg.drop_duplicates(subset=['documento','nome_vencedor','id_licitacao_bd','id_municipio','ano','orgao'],inplace=True)\n", + "\n", + "licitacao_item_mg['vencedor']=\"1\"\n", + "\n", + "licitacao_item_mg.rename({'nome_vencedor':'razao_social'}, axis=1, inplace=True)\n", + "\n", + "# Merge to get suppliers\n", + "\n", + "mg = pd.merge(mg_habilitados, licitacao_item_mg, how='outer',\n", + " left_on=['ano','id_municipio','id_licitacao_bd','id_licitacao',\n", + " 'orgao', 'id_unidade_gestora','documento','razao_social'],\n", + " right_on=['ano','id_municipio','id_licitacao_bd','id_licitacao',\n", + " 'orgao','id_unidade_gestora','documento','razao_social'], indicator=True)\n", + "\n", + "#both 898133\n", + "#right_only 327541\n", + "#left_only 126115\n", + "\n", + "mg['vencedor'] = mg['vencedor'].replace(np.nan,\"0\")\n", + "\n", + "# Create type variable according to document characters\n", + "# 1 if cnpj\n", + "# 2 if cpf\n", + "\n", + "mg['documento'] = mg['documento'].str.replace(\"-\",\"\")\n", + "mg['documento'] = mg['documento'].str.replace(\".\",\"\")\n", + "mg['documento']=mg['documento'].str.strip()\n", + "\n", + "mg['length'] = mg['documento'].str.len()\n", + "\n", + "conditions = [\n", + " (mg['length'] == 14),\n", + " (mg['length'] == 11),\n", + " (mg['length'] != 11) | (mg['length'] != 14)\n", + "]\n", + "categories = ['1', '2', '']\n", + "mg['tipo'] = np.select(conditions, 
# PR
#
# Builds the participant table for Parana (PR): reads one
# "<year>_<municipality>_LicitacaoParticipante.csv" per year/municipality,
# derives the 'classificado' and 'habilitado' flags, attaches the 7-digit
# municipality id, flags winners using the item tables and saves the result.
# Relies on `municipio`, `ordem` and `path` defined in earlier cells.
# NOTE(review): `path` is assumed to be the project root
# ('/content/gdrive/MyDrive/ComprasPublicas_Brasil') -- confirm.

# List municipalities to loop
# Seven 6-digit codes are left out of the file listing.
# NOTE(review): presumably no participant file exists for them -- confirm.
municipios_pr_excluidos = ['411915', '411370', '411535', '411710',
                           '412627', '410140', '410350']

municipio_pr = municipio.query('sigla_uf=="PR"')
municipio_pr = municipio_pr[~municipio_pr['id_municipio_6'].isin(municipios_pr_excluidos)]

municipios_pr = municipio_pr['id_municipio_6'].tolist()

anos_pr = ['2013','2014','2015','2016','2017','2018','2019','2020','2021']

# One participant file per (year, municipality).
# The previous loop built the file name with exec() but appended the unrelated
# `path` variable, so the participant files were never actually listed.
pasta_pr = '/content/gdrive/MyDrive/ComprasPublicas_Brasil/input/PR'
all_files_pr = [
    f'{pasta_pr}/{a}/Licitacao/{m}/{a}_{m}_LicitacaoParticipante.csv'
    for a in anos_pr
    for m in municipios_pr
]

# Read every file, identifying its origin
all_df_pr = []
for f in all_files_pr:
    df1 = pd.read_csv(f, sep=',', encoding='utf-8', dtype=str)
    df1['arquivo'] = f.split('/')[-1]
    all_df_pr.append(df1)

pr1 = pd.concat(all_df_pr, ignore_index=True, sort=True)

# Drop and rename

pr1_drop = ['DataReferencia','arquivo', 'nmEntidade', 'nrLicitacao',
            'ultimoEnvioSIMAMNesteExercicio','dtOcorrencia','nmMunicipio']

pr1.drop(pr1_drop, axis=1, inplace=True)

pr1_rename = {'cdIBGE':'id_municipio', 'idLicitacao':'id_licitacao', 'idEntidade':'id_unidade_gestora',
              'nrAnoLicitacao':'ano', 'nmParticipanteLicitacao':'razao_social',
              'nrDocParticipanteLicitacao':'documento','sgDocParticipanteLicitacao':'tipo'}

pr1.rename(pr1_rename, axis=1, inplace=True)

# Format: participant type 1 = CNPJ, 2 = CPF

pr1['tipo'] = pr1['tipo'].str.strip()
pr1['tipo'] = pr1['tipo'].replace(['CNPJ','CPF'],['1','2'])

# Reshape - long to wide
# Each status family becomes its own 0/1 column. The subsets are explicit
# copies so that adding/dropping columns does not act on a view of pr1.

situacao = pr1['dsTipoSituacaoParticipante']

classificados_pr = pr1[situacao.isin(['Classificado', 'Desclassificado'])].copy()
classificado = {'Classificado':'1', 'Desclassificado':'0'}
classificados_pr['classificado'] = classificados_pr['dsTipoSituacaoParticipante'].map(classificado)
classificados_pr.drop('dsTipoSituacaoParticipante', axis=1, inplace=True)

habilitados_pr = pr1[situacao.isin(['Habilitado', 'Desabilitado'])].copy()
habilitado = {'Habilitado':'1', 'Desabilitado':'0'}
habilitados_pr['habilitado'] = habilitados_pr['dsTipoSituacaoParticipante'].map(habilitado)
habilitados_pr.drop('dsTipoSituacaoParticipante', axis=1, inplace=True)

# Merge classificados and habilitados
chave_participante = ['id_licitacao', 'documento', 'ano', 'id_municipio',
                      'id_unidade_gestora', 'razao_social', 'tipo']
pr = pd.merge(classificados_pr, habilitados_pr, how='outer', on=chave_participante)

# Participants level to tender level
# Deal with repeated rows for some participants
pr = pr.drop_duplicates(subset=['id_municipio','id_unidade_gestora','id_licitacao','ano','documento'])

# Format: strip punctuation from the document number.
# regex=False makes "." a literal dot regardless of the pandas version.
pr['documento'] = pr['documento'].str.replace("-", "", regex=False)
pr['documento'] = pr['documento'].str.replace(".", "", regex=False)
pr['documento'] = pr['documento'].str.strip()

# Merge id_municipio 6 and 7 digits id

pr['id_municipio'] = pr['id_municipio'].astype('string')

pr = pd.merge(pr, municipio, how='left', left_on='id_municipio', right_on='id_municipio_6')

pr.drop(['id_municipio_x','id_municipio_6','nome'], axis=1, inplace=True)
pr.rename({'id_municipio_y':'id_municipio'}, axis=1, inplace=True)

# Create a unique identifier for each purchase

pr['id_licitacao_bd'] = pr['id_licitacao'] + pr['id_unidade_gestora'] + pr['sigla_uf']

# Get winner information from licitacao_item
# The three temp files are read from under `path` (the base directory was
# missing before) and stacked with pd.concat (DataFrame.append no longer
# exists in pandas >= 2.0).

colunas_item = ['ano','id_municipio','id_licitacao_bd','nome_vencedor','documento']
item_pr = pd.concat(
    [pd.read_csv(os.path.join(path, f"output/temp/item_pr{i}.csv"), dtype=str,
                 encoding='utf-8', usecols=colunas_item)
     for i in (1, 2, 3)],
    ignore_index=True)

# Make each supplier only appear once

item_pr.drop_duplicates(subset=['documento','nome_vencedor','id_licitacao_bd','id_municipio','ano'], inplace=True)

item_pr['vencedor'] = "1"

item_pr.rename({'nome_vencedor':'razao_social'}, axis=1, inplace=True)

# Then Merge
pr = pd.merge(pr, item_pr, how='left',
              on=['ano','id_municipio', 'id_licitacao_bd','razao_social','documento'],
              indicator=True)

# Participants who are not in licitacao_item, are not winners
pr['vencedor'] = pr['vencedor'].fillna("0")

# Reorder columns
pr = pr.reindex(columns=ordem)

# Save
pr.to_csv(os.path.join(path, "output/licitacao_participante_pr.csv"), index=False, na_rep='', float_format='%.2f')
# RS
#
# Builds the participant table for Rio Grande do Sul (RS) from the yearly zip
# archives (licitante.csv, licitacao.csv, item.csv, pessoas.csv) and saves it.
# Relies on `municipio`, `orgao_municipio`, `ordem` and `path` defined in
# earlier cells.

# List municipalities to loop

municipio_rs = municipio.query('sigla_uf=="RS"')
municipios_rs = municipio_rs['id_municipio'].tolist()

# Rename and Drop variables

rs_rename = {'ANO_LICITACAO':'ano','NR_DOCUMENTO':'documento','NR_LICITACAO':'id_licitacao',
             'TP_RESULTADO_HABILITACAO':'habilitado','TP_DOCUMENTO':'tipo','CD_TIPO_MODALIDADE':'modalidade'}

rs_drop = ['BL_BENEFICIO_MICRO_EPP','TP_DOCUMENTO.1','TP_CONDICAO','NR_DOCUMENTO.1']


def normalize_documento(doc):
    """Return *doc* with non-numeric entries blanked and short CNPJs zero-padded.

    Entries that are not purely numeric (usually the supplier name typed in the
    document field) become NaN; entries with 12 or 13 characters are
    left-padded with zeros to 14 characters.
    """
    doc = pd.Series(np.where(doc.str.isnumeric(), doc, np.nan), index=doc.index, dtype=object)
    length = doc.str.len()
    padded = np.where((length > 11) & (length < 14), doc.str.zfill(14), doc)
    return pd.Series(padded, index=doc.index, dtype=object)


anos_rs = ['2016','2017','2018','2019','2020','2021']

# RS folder
pasta = os.path.join(path, "input/RS")

# Columns that identify a tender in the RS files
chave_licitacao = ['CD_ORGAO','NR_LICITACAO','ANO_LICITACAO','CD_TIPO_MODALIDADE']

# Helper columns brought in by the merges with licitacao/item; dropped after use
colunas_auxiliares = ['NR_DOCUMENTO_2','NR_DOCUMENTO_FORNECEDOR','NR_DOCUMENTO_VENCEDOR',
                      'TP_DOCUMENTO_2','TP_DOCUMENTO_FORNECEDOR','TP_DOCUMENTO_VENCEDOR']

all_df_rs = []
for a in anos_rs:
    zip_path = os.path.join(pasta, f'Licitacao/{a}.csv.zip')
    with ZipFile(zip_path) as z:
        # Participants (bidders)
        with z.open('licitante.csv') as f:
            rs = pd.read_csv(f, sep=',', encoding='utf-8', dtype=str)

        # Files 'licitacao' and 'item' have information about PRD, PRI, RPO
        with z.open('licitacao.csv') as f:
            rs2 = pd.read_csv(f, sep=',', encoding='utf-8', dtype=str,
                              usecols=['CD_ORGAO','NR_LICITACAO','ANO_LICITACAO','CD_TIPO_MODALIDADE',
                                       'TP_DOCUMENTO_VENCEDOR','NR_DOCUMENTO_VENCEDOR',
                                       'TP_DOCUMENTO_FORNECEDOR','NR_DOCUMENTO_FORNECEDOR'])

        # Items file, to get more participant information
        with z.open('item.csv') as f:
            rs3 = pd.read_csv(f, sep=',', encoding='utf-8', dtype=str,
                              usecols=['ANO_LICITACAO','NR_LICITACAO','CD_TIPO_MODALIDADE',
                                       'NR_DOCUMENTO.1','NR_DOCUMENTO','CD_ORGAO',
                                       'TP_DOCUMENTO.1','TP_DOCUMENTO'])

        # Corporate name (razao_social) and address information
        with z.open('pessoas.csv') as f:
            rs4 = pd.read_csv(f, sep=',', encoding='utf-8', dtype=str,
                              usecols=['NR_DOCUMENTO','NM_PESSOA','CEP','TP_PESSOA',
                                       'LOGRADOURO','NR_ENDERECO'])

    rs.drop(rs_drop, axis=1, inplace=True)

    # In PRD (Processo de Dispensa), PRI (Processo de Inexigibilidade) and
    # RPO (Adesão à Ata de Registro de Preços) there is no participant registry.
    # For the other purchase types the registry exists (author's note).

    # Blank non-numeric documents and left-pad short CNPJs
    rs['NR_DOCUMENTO'] = normalize_documento(rs['NR_DOCUMENTO'])

    # Author's note: many documents are missing in licitacao.csv (~67%)
    #rs2['NR_DOCUMENTO_VENCEDOR'].isna() & rs2['NR_DOCUMENTO_FORNECEDOR'].isna()

    # item.csv has two document columns; one of them carries the information
    # when the purchase type is PRD, PRI or RPO. Coalesce them into one.
    rs3['NR_DOCUMENTO'] = rs3['NR_DOCUMENTO'].fillna(rs3['NR_DOCUMENTO.1'])
    rs3['TP_DOCUMENTO'] = rs3['TP_DOCUMENTO'].fillna(rs3['TP_DOCUMENTO.1'])

    # Drop non necessary variables
    rs3.drop(['NR_DOCUMENTO.1','TP_DOCUMENTO.1'], axis=1, inplace=True)

    # A supplier may win many items within the same tender: keep it once
    rs3.drop_duplicates(subset=['NR_DOCUMENTO','CD_ORGAO','ANO_LICITACAO','CD_TIPO_MODALIDADE','NR_LICITACAO'], inplace=True)

    # Merge licitacao and items
    rs2 = pd.merge(rs2, rs3, how='left', on=chave_licitacao)

    # Coalesce all document variables: item document first, then the
    # supplier (FORNECEDOR) and finally the winner (VENCEDOR) columns
    rs2['NR_DOCUMENTO'] = (rs2['NR_DOCUMENTO']
                           .fillna(rs2['NR_DOCUMENTO_FORNECEDOR'])
                           .fillna(rs2['NR_DOCUMENTO_VENCEDOR']))
    rs2['TP_DOCUMENTO'] = (rs2['TP_DOCUMENTO']
                           .fillna(rs2['TP_DOCUMENTO_FORNECEDOR'])
                           .fillna(rs2['TP_DOCUMENTO_VENCEDOR']))

    rs2.rename({'NR_DOCUMENTO':'NR_DOCUMENTO_2', 'TP_DOCUMENTO':'TP_DOCUMENTO_2'}, axis=1, inplace=True)

    # Blank non-numeric documents and left-pad short CNPJs
    rs2['NR_DOCUMENTO_2'] = normalize_documento(rs2['NR_DOCUMENTO_2'])

    # Subset with PRD, PRI and RPO to merge outer
    rs2_2 = rs2[rs2['CD_TIPO_MODALIDADE'].isin(['PRD', 'PRI', 'RPO'])]

    # Merge main licitante dataframe with the above subset to get participants
    # information for PRD, PRI, RPO
    rs = pd.merge(rs, rs2_2, how='outer', on=chave_licitacao)

    # Coalesce documents to create only one variable
    rs['NR_DOCUMENTO'] = rs['NR_DOCUMENTO'].fillna(rs['NR_DOCUMENTO_2'])
    rs['TP_DOCUMENTO'] = rs['TP_DOCUMENTO'].fillna(rs['TP_DOCUMENTO_2'])

    # Author's note: still ~5000 documents missing, all related to PRD, PRI,
    # RPO. Those observations are dropped, as they add no information here.
    rs = rs.dropna(subset=['NR_DOCUMENTO'])

    rs.drop(colunas_auxiliares, axis=1, inplace=True)

    # Identify the winners: rs2 comes from licitacao/item, so (per the
    # author's note) it only reports winners. A participant whose document
    # matches a row of rs2 for the same tender is flagged as winner.
    rs = pd.merge(rs, rs2, how='left',
                  left_on=chave_licitacao + ['NR_DOCUMENTO'],
                  right_on=chave_licitacao + ['NR_DOCUMENTO_2'])

    rs['vencedor'] = np.where(rs['NR_DOCUMENTO'] == rs['NR_DOCUMENTO_2'], 1, 0)

    # Drop non necessary variables
    rs.drop(colunas_auxiliares, axis=1, inplace=True)

    # Rename
    rs.rename(rs_rename, axis=1, inplace=True)

    # pessoas.csv: corporate name, postal code and address
    rs4.rename({'NM_PESSOA':'razao_social','NR_DOCUMENTO':'documento','CEP':'cep'}, axis=1, inplace=True)

    rs4['endereco'] = rs4['LOGRADOURO'] + ',' + rs4['NR_ENDERECO']

    # Format: round-trip through float to render the postal code without
    # decimals, then restore missing values
    rs4['cep'] = rs4['cep'].astype(float)
    rs4['cep'] = rs4['cep'].apply(lambda x: f'{x:.0f}')
    rs4['cep'] = rs4['cep'].astype(str)
    rs4['cep'] = rs4['cep'].replace("nan", np.nan)

    rs4.drop(['LOGRADOURO','NR_ENDERECO'], axis=1, inplace=True)

    # Blank non-numeric documents and left-pad short CNPJs
    rs4['documento'] = normalize_documento(rs4['documento'])

    # One row per document, none missing
    rs4.drop_duplicates(subset=['documento'], inplace=True)
    rs4 = rs4.dropna(subset=['documento'])

    # Merge to get suppliers name
    rs = pd.merge(rs, rs4, how='left', on=['documento'])

    # Drop MAI (Manifestação de Interesse)
    rs = rs[~(rs['modalidade'] == "MAI")]

    # Recode some variables
    rs['habilitado'] = rs['habilitado'].replace(['H','I','N'],['1','0',''])  # N - Did not attend, exclusive to the invitation modality
    rs['tipo'] = rs['tipo'].replace(['J','F','E','P'],['1','2','3',''])
    # Manual correction for one document recorded with type "3"
    rs['tipo'] = np.where((rs['documento'] == "05996565000194") & (rs['tipo'] == "3"), "1", rs['tipo'])

    rs['sigla_uf'] = "RS"

    # Create a unique identifier for each purchase
    rs['id_licitacao_bd'] = rs['id_licitacao'] + rs['ano'] + rs['modalidade'] + rs['CD_ORGAO'] + rs['sigla_uf']

    # Merge to get id_municipio

    rs = pd.merge(rs, orgao_municipio, how='left', on='CD_ORGAO', indicator=True)

    rs.drop(['NOME_ORGAO','ESFERA','SIGLA_ORGAO','SETOR_GOVERNAMENTAL','CNPJ','HOME_PAGE','NATUREZA_JURIDICA',
             'CONTABILIDADE','SITUACAO_ORGAO','CD_MUNICIPIO_TCERS','NOME_MUNICIPIO'], axis=1, inplace=True)

    rs.rename({'CD_MUNICIPIO_IBGE':'id_municipio','CD_ORGAO':'orgao'}, axis=1, inplace=True)

    # Keep only rows whose agency was found in orgao_municipio
    rs = rs[rs['_merge'] == "both"]

    rs.drop('_merge', axis=1, inplace=True)

    all_df_rs.append(rs)

rs = pd.concat(all_df_rs, ignore_index=True, sort=True)

# Drop duplicates in all rows (0.01% obs)
rs = rs.drop_duplicates()

# Reorder columns
rs = rs.reindex(columns=ordem)

# Save
# Written as licitacao_participante_rs.csv, the name used by the other
# states' cells and read back by the partition step (it was previously saved
# as licitacao_rs.csv, which the partition step never reads).
rs.to_csv(os.path.join(path, "output/licitacao_participante_rs.csv"), index=False, na_rep='', float_format='%.2f')
# PB
#
# Builds the participant table for Paraiba (PB) from the pipe-delimited
# proposals file and saves it. Relies on `municipio`, `ordem` and `path`
# defined in earlier cells.

# Open files

pb = pd.read_csv(
    os.path.join(path, "input/PB/TCE-PB-Portal-Gestor-Licitacoes_Propostas.txt"),
    sep="|", encoding='utf-8', dtype=str,
)

# Columns discarded once the winner flag has been derived
pb_drop = ['jurisdicionado_id','nome_jurisdicionado','nome_tipo_jurisdicionado',
           'nome_tipo_administracao_jurisdicionado','nome_esfera_jurisdicionado',
           'nome_modalidade_licitacao','nome_setor_atual_licitacao','url','objeto_licitacao',
           'valor_estimado_licitacao','valor_proposta','situacao_proposta',
           'valor_licitado_licitacao','nome_estagio_processual_licitacao',
           'situacao_fracassada_licitacao','data_homologacao_licitacao']

pb_rename = {'ano_homologacao_licitacao':'ano','cd_ugestora':'id_unidade_gestora',
             'cpf_cnpj_proponente':'documento', 'nome_proponente':'razao_social','protocolo_licitacao':'id_licitacao'}

pb = pb.rename(columns=pb_rename)

# Author's note: 50298 rows have no municipality name -- regional
# inter-municipal consortia (e.g. Consórcio Intermunicipal de Saúde dos
# Municípios do Alto Sertão Paraibano) or state-level bodies. They are dropped.
#print(pb['nome_municipio'].isna().sum())
pb = pb.dropna(subset=['nome_municipio'])

# Extract year from numero_licitacao (characters 7-10), fixing two typos
ano_licitacao = pb['numero_licitacao'].str[6:10]
for ano_errado in ("2104", "3014"):
    ano_licitacao = ano_licitacao.str.replace(ano_errado, "2014")
pb['ano'] = ano_licitacao

pb['sigla_uf'] = "PB"

# Assign winner
pb['vencedor'] = np.where(pb['situacao_proposta'] == "Vencedora", "1", "0")

# Drop non-necessary variables
pb = pb.drop(columns=pb_drop)

# Merge to get id_municipio
# Two spellings are adjusted first so they match the names in `municipio`
for grafia_antiga, grafia_nova in (('Santa Terezinha', 'Santa Teresinha'),
                                   ('Quixaba', 'Quixabá')):
    pb['nome_municipio'] = pb['nome_municipio'].str.replace(grafia_antiga, grafia_nova)

pb = pb.merge(municipio, how='left',
              left_on=['nome_municipio','sigla_uf'],
              right_on=['nome','sigla_uf'], indicator=True)
pb = pb.drop(columns=['nome','nome_municipio','id_municipio_6','id_municipio_tce'])

# Format
pb['id_licitacao'] = pb['id_licitacao'].str[5:].str.replace('/','')
pb['ano'] = pb['ano'].astype(int)

# Create a unique identifier for each purchase
pb['id_licitacao_bd'] = pb['id_licitacao'] + pb['id_unidade_gestora'] + pb['sigla_uf']

# Assign participant type (1- CNPJ, 2- CPF): 14 characters -> "1", else "2"
pb['length'] = pb['documento'].str.len()
pb['tipo'] = np.where(pb['length'] == 14, "1", "2")

# Drop years after 2021
pb = pb[pb['ano'] < 2022]

# Duplicates drop (author's note: 0.11%, only changes on proposal value)
pb = pb.drop_duplicates(['id_licitacao_bd','razao_social','documento'])

# Reorder columns
pb = pb.reindex(columns=ordem)

# Save
pb.to_csv(os.path.join(path, "output/licitacao_participante_pb.csv"), index=False, na_rep='', float_format='%.2f')
# Partition
#
# Splits each state's participant table into one CSV per year and state:
#   <output>/licitacao_participante/ano=<year>/sigla_uf=<UF>/microdados.csv

# Years written for each state
ANOS_POR_UF = {
    'CE': range(2009, 2022),
    'PE': range(2012, 2022),
    'MG': range(2014, 2022),
    'PR': range(2013, 2022),
    'RS': range(2016, 2022),
    'PB': range(2014, 2022),
}

# List of UFs
ufs = list(ANOS_POR_UF)


def partition_participants(output_dir, anos_por_uf=None):
    """Write one ``microdados.csv`` per (year, state) under *output_dir*.

    For every state in *anos_por_uf*, reads
    ``<output_dir>/licitacao_participante_<uf>.csv`` and, for each of that
    state's years, writes the rows of that year (without the ``ano`` and
    ``sigla_uf`` columns, which are encoded in the folder names) to
    ``<output_dir>/licitacao_participante/ano=<year>/sigla_uf=<UF>/microdados.csv``.
    A year with no rows still produces a header-only file.

    Each state's file is partitioned only into that state's own folders and
    over that state's own year range. The previous nested loop re-iterated
    over every UF while holding a single state's frame, so all state folders
    received the same data, and years before 2012 were never written.

    Args:
        output_dir: Folder holding the per-state CSVs; partitions are created
            beneath it.
        anos_por_uf: Mapping of state acronym to the years to write. Defaults
            to ``ANOS_POR_UF``.
    """
    if anos_por_uf is None:
        anos_por_uf = ANOS_POR_UF

    for uf, anos in anos_por_uf.items():
        # Load the corresponding CSV file for the UF
        file_path = os.path.join(output_dir, f'licitacao_participante_{uf.lower()}.csv')
        df = pd.read_csv(file_path, dtype=str, encoding='utf-8')

        # Convert 'ano' column to integer so it compares with the year range
        df['ano'] = df['ano'].astype(int)

        # Save csv by year and state
        for ano in anos:
            print("Particionando {} do {}".format(ano, uf))
            particao = df[df['ano'] == ano].drop(['ano', 'sigla_uf'], axis=1)
            destino = os.path.join(output_dir, 'licitacao_participante',
                                   f'ano={ano}', f'sigla_uf={uf}')
            # to_csv does not create missing folders
            os.makedirs(destino, exist_ok=True)
            particao.to_csv(os.path.join(destino, 'microdados.csv'), index=False,
                            encoding='utf-8', na_rep='', float_format='%.2f')


# True when the cell is executed in the notebook kernel
if __name__ == "__main__":
    partition_participants('/content/gdrive/MyDrive/ComprasPublicas_Brasil/output')