
Commit

adds a uniqueness check for extracted links
folhesgabriel committed Oct 25, 2023
1 parent 2e491c4 commit 640247f
Showing 2 changed files with 11 additions and 28 deletions.
15 changes: 4 additions & 11 deletions pipelines/datasets/br_ons_avaliacao_operacao/utils.py
@@ -137,12 +137,10 @@ def download_data(
Args:
path (str): the path to store the data
url (str): the table URL from ONS website.
- table_name (str): the table name is the original name of the zip file with raw data from comex stat website
+ table_name (str): the table name is the original name
"""
# selects a url given a table name

# log(f"Downloading data from {url}")

# downloads the file and saves it
wget.download(url, out=path + table_name + "/input")
# just for precaution,
@@ -153,29 +151,24 @@ def download_data(
def crawler_ons(
url: str,
) -> List[str]:
"""this function extract all download links from bcb agencias website
"""this function extract all download links from ONS website
Args:
url (str): bcb url https://www.bcb.gov.br/fis/info/agencias.asp?frame=1
Returns:
list: a list of file links
"""
# Send a GET request to the URL
response = requests.get(url)

# Parse the HTML content of the response using lxml
html = response.text

# Parse the HTML code
soup = BeautifulSoup(html, "html.parser")

# Find all 'a' elements with href containing ".csv"
csv_links = soup.find_all("a", href=lambda href: href and href.endswith(".csv"))

# Extract the href attribute from the csv_links
csv_urls = [link["href"] for link in csv_links]
- # Print the csv_urls
- # print(csv_urls)
+ # Filter unique values
+ csv_urls = list(set(csv_urls))

return csv_urls
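
Note: list(set(csv_urls)) removes duplicate links but does not preserve the order in which they appear on the page. If ordering mattered, an order-preserving variant could look like the sketch below (not part of this commit; the helper name is hypothetical):

from typing import List

def dedupe_preserving_order(urls: List[str]) -> List[str]:
    # dict keys keep insertion order (Python 3.7+), so duplicates are dropped
    # while the original link order from the page is preserved
    return list(dict.fromkeys(urls))

# e.g. dedupe_preserving_order(["a.csv", "b.csv", "a.csv"]) -> ["a.csv", "b.csv"]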

24 changes: 7 additions & 17 deletions pipelines/datasets/br_ons_estimativa_custos/utils.py
@@ -131,50 +131,40 @@ def parse_year_or_year_month(url: str) -> datetime:
def crawler_ons(
url: str,
) -> List[str]:
"""this function extract all download links from bcb agencias website
"""this function extract all download links from ONS website
Args:
url (str): bcb url https://www.bcb.gov.br/fis/info/agencias.asp?frame=1
Returns:
list: a list of file links
"""
# Send a GET request to the URL
response = requests.get(url)

# Parse the HTML content of the response using lxml
html = response.text

# Parse the HTML code
soup = BeautifulSoup(html, "html.parser")

# Find all 'a' elements with href containing ".csv"
csv_links = soup.find_all("a", href=lambda href: href and href.endswith(".csv"))

# Extract the href attribute from the csv_links
table_urls = [link["href"] for link in csv_links]
# table_names = [name["a"] for name in table_urls]
csv_urls = [link["href"] for link in csv_links]
# Filtra valores únicos
csv_urls = list(set(csv_urls))

# Print the csv_urls
print(table_urls)
# print(table_names)
return table_urls
return csv_urls


def download_data(
path: str,
url: str,
table_name: str,
) -> str:
"""A simple crawler to download data from ONS website.
"""A simple function to download data from ONS website.
Args:
path (str): the path to store the data
url (str): the table URL from ONS website.
- table_name (str): the table name is the original name of the zip file with raw data from comex stat website
+ table_name (str): the table name
"""
# selects a url given a table name

# log(f"Downloading data from {url}")

# downloads the file and saves it
wget.download(url, out=path + table_name + "/input")
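
For context, here is a minimal sketch of how these two helpers might be wired together after this change (not part of this commit; the product page URL, table name, and local path are placeholders):

import os

from pipelines.datasets.br_ons_estimativa_custos.utils import crawler_ons, download_data

page_url = "https://example.com/ons-product-page"  # placeholder for an ONS product page URL
table_name = "custo_marginal_operacao"             # hypothetical table name
path = "/tmp/br_ons_estimativa_custos/"

# wget.download does not create directories, so the input folder is created up front
os.makedirs(path + table_name + "/input", exist_ok=True)

# crawler_ons now returns a de-duplicated list of .csv links
for csv_url in crawler_ons(page_url):
    download_data(path=path, url=csv_url, table_name=table_name)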

