Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[BugFix] br_rf_cno #897

Merged
merged 3 commits into from
Nov 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions pipelines/datasets/br_rf_cno/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@

class constants(Enum):

URL = "http://dadosabertos.rfb.gov.br/CNO/cno.zip"
URL_FTP = "http://dadosabertos.rfb.gov.br/CNO"
URL = "https://arquivos.receitafederal.gov.br/dados/cno/cno.zip"
URL_FTP = "https://arquivos.receitafederal.gov.br/dados/cno/"

TABLES_RENAME = {
'cno.csv': 'microdados',
Expand Down
58 changes: 45 additions & 13 deletions pipelines/datasets/br_rf_cno/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,39 +19,71 @@
from pipelines.utils.utils import log


# NOTE: The crawler will fail if the file name changes.
@task
def check_need_for_update(url: str) -> str:
    """
    Checks the need for an update by extracting the most recent update date for 'cno.zip' from the CNO FTP.

    The page at ``url`` is parsed as an HTML directory listing: every table
    row is scanned until one is found whose second cell links to 'cno.zip';
    the text of that row's third cell is read as the last-modified timestamp.

    Args:
        url (str): The URL of the CNO FTP site.

    Returns:
        str: The date of the last update in 'YYYY-MM-DD' format.

    Raises:
        requests.HTTPError: If the response status code is not 200.
        requests.exceptions.RequestException: If the request itself fails
            (connection error, timeout, ...).
        ValueError: If the file 'cno.zip' is not found in the URL, or if its
            date cell does not match the 'YYYY-MM-DD HH:MM' format.
    """
    log('---- Extracting most recent update date from CNO FTP')

    # requests applies no timeout by default; without one, an unresponsive
    # server would block this task forever.
    response = requests.get(url, timeout=60)
    if response.status_code != 200:
        raise requests.HTTPError(f"HTTP error occurred: Status code {response.status_code}")

    soup = BeautifulSoup(response.content, 'html.parser')

    # Walk each table row ('tr') and inspect its data cells ('td').
    for row in soup.find_all('tr'):
        cells = row.find_all('td')

        # Rows with fewer than four cells are skipped; file entries are
        # expected to carry name and date in cells 1 and 2
        # (presumably header/separator rows are shorter -- verify against the page).
        if len(cells) < 4:
            continue

        link = cells[1].find('a')
        if not link:
            continue

        if link.get_text(strip=True) != "cno.zip":
            continue

        date_text = cells[2].get_text(strip=True)
        try:
            parsed_date = datetime.strptime(date_text, "%Y-%m-%d %H:%M")
        except ValueError as err:
            # Re-raise with context so a layout/format change is easy to diagnose.
            raise ValueError(
                f"Unexpected date format {date_text!r} for 'cno.zip' at {url}. "
                "Expected 'YYYY-MM-DD HH:MM'; the website HTML might have changed"
            ) from err

        max_file_date = parsed_date.strftime("%Y-%m-%d")
        log(f"---- Most recent update date for 'cno.zip': {max_file_date}")
        return max_file_date

    # Reaching this point means no row matched 'cno.zip'.
    raise ValueError(
        f"File 'cno.zip' not found on the FTP site. Check the api endpoint: {url} "
        "to see if the folder structure or file name has changed"
    )


@task
Expand Down
Loading