Merge branch 'main' into staging/br_anatel_telefonia
mergify[bot] authored Nov 6, 2023
2 parents 5d72ca6 + 6a842f4 commit 46dfea3
Showing 3 changed files with 54 additions and 2 deletions.
4 changes: 2 additions & 2 deletions pipelines/datasets/br_mercadolivre_ofertas/flows.py
@@ -33,7 +33,7 @@
 )
 
 with Flow(
-    name="br_mercadolivre_ofertas.item", code_owners=["Gabs"]
+    name="br_mercadolivre_ofertas.item", code_owners=["Gabriel Pisa"]
 ) as br_mercadolivre_ofertas_item:
     # Parameters
     dataset_id = Parameter(
@@ -162,7 +162,7 @@
 br_mercadolivre_ofertas_item.schedule = every_day_item
 
 with Flow(
-    name="br_mercadolivre_ofertas.vendedor", code_owners=["Gabs"]
+    name="br_mercadolivre_ofertas.vendedor", code_owners=["Gabriel Pisa"]
 ) as br_mercadolivre_ofertas_vendedor:
     # Parameters
     dataset_id = Parameter(
1 change: 1 addition & 0 deletions pipelines/datasets/br_mercadolivre_ofertas/tasks.py
@@ -29,6 +29,7 @@
 kwargs_list = const_mercadolivre.KWARGS_LIST.value
 url_lists = {"oferta_dia": []}
 
+# loop to fetch all pages
 for i in range(1, 21):
     urls = {"oferta_dia": oferta_dia + str(i)}
     for table, url in urls.items():
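For context, a minimal runnable sketch of what this loop does. The base URL and the append into url_lists are assumptions: oferta_dia actually comes from the pipeline's constants, and the diff truncates the loop body.

# Sketch of the pagination above (assumed base URL; in the pipeline,
# oferta_dia comes from the dataset's constants module).
oferta_dia = "https://www.mercadolivre.com.br/ofertas?page="

url_lists = {"oferta_dia": []}

# loop to fetch all pages (1 through 20)
for i in range(1, 21):
    urls = {"oferta_dia": oferta_dia + str(i)}
    for table, url in urls.items():
        url_lists[table].append(url)  # assumed: the diff cuts off here

print(len(url_lists["oferta_dia"]))  # 20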
51 changes: 51 additions & 0 deletions pipelines/datasets/br_mercadolivre_ofertas/utils.py
@@ -22,6 +22,15 @@
 
 # ! data processing
 def clean_experience(x):
+    """
+    Cleans and extracts numeric experience data from a string.
+
+    Args:
+        x (str): The input string containing experience information.
+
+    Returns:
+        int or None: The extracted numeric experience value, or None if not found.
+    """
     try:
         result = re.findall(r"\d+", x)[0]
     except Exception:
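For illustration, a self-contained version of clean_experience consistent with the new docstring; the int() cast and the None fallback are assumptions, since the diff cuts off before the return.

import re

def clean_experience(x):
    # Sketch per the docstring above; the int() cast and None fallback are assumed.
    try:
        result = re.findall(r"\d+", x)[0]
    except Exception:
        return None
    return int(result)

assert clean_experience("10 Anos vendendo no Mercado Livre") == 10
assert clean_experience("sem dados") is None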
@@ -157,6 +166,15 @@ def get_features(soup):
 # ! used in the items table process
 @retry
 def get_review(soup):
+    """
+    Retrieves review information from a web page using BeautifulSoup.
+
+    Args:
+        soup (BeautifulSoup): The BeautifulSoup object representing the web page.
+
+    Returns:
+        dict: A dictionary containing review information, including stars and review count.
+    """
     script_elements = soup.find_all("script", type="application/ld+json")
 
     json_data = json.loads(script_elements[0].string)
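To make the JSON-LD step concrete, here is a sketch of the parsing pattern get_review relies on. The sample HTML and the stars/review_count keys are assumptions based on the docstring, not MercadoLivre's actual markup.

import json
from bs4 import BeautifulSoup

html = """
<script type="application/ld+json">
{"@type": "Product", "aggregateRating": {"ratingValue": 4.7, "reviewCount": 321}}
</script>
"""

soup = BeautifulSoup(html, "html.parser")
script_elements = soup.find_all("script", type="application/ld+json")
json_data = json.loads(script_elements[0].string)

# schema.org aggregateRating carries the star rating and review count
rating = json_data.get("aggregateRating", {})
review_info = {"stars": rating.get("ratingValue"), "review_count": rating.get("reviewCount")}
print(review_info)  # {'stars': 4.7, 'review_count': 321}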
@@ -184,8 +202,18 @@ def get_review(soup):
 # return review_info
 
 
+# ! used in the items table process
 @retry
 def get_categories(soup):
+    """
+    Retrieves categories from a web page using BeautifulSoup.
+
+    Args:
+        soup (BeautifulSoup): The BeautifulSoup object representing the web page.
+
+    Returns:
+        list: A list of categories extracted from the web page.
+    """
     script_elements = soup.find_all("script", type="application/ld+json")
     categories = []
     # Loop through script elements and extract the desired content
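Similarly, a sketch of the category extraction, assuming the categories arrive as a schema.org BreadcrumbList inside a JSON-LD script tag; the sample HTML is illustrative only.

import json
from bs4 import BeautifulSoup

html = """
<script type="application/ld+json">
{"@type": "BreadcrumbList", "itemListElement": [
  {"@type": "ListItem", "position": 1, "item": {"name": "Eletrônicos"}},
  {"@type": "ListItem", "position": 2, "item": {"name": "Celulares"}}]}
</script>
"""

soup = BeautifulSoup(html, "html.parser")
categories = []
# Loop through script elements and extract the desired content
for script in soup.find_all("script", type="application/ld+json"):
    data = json.loads(script.string)
    if data.get("@type") == "BreadcrumbList":
        categories += [element["item"]["name"] for element in data.get("itemListElement", [])]

print(categories)  # ['Eletrônicos', 'Celulares']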
@@ -218,6 +246,7 @@ def get_seller_link(soup):
     return seller_link
 
 
+# ! used in the items table process
 @retry
 def get_prices(soup, **kwargs):
     """
@@ -417,6 +446,18 @@ def get_features_seller(soup):

 # ! part of the seller table process
 async def get_seller_async(url, seller_id):
+    """
+    Extracts seller qualification information from <span> elements asynchronously.
+    Because the function is async, it can yield control while awaiting network
+    I/O, so many seller pages can be scraped concurrently without blocking.
+
+    Args:
+        url (str): The URL of the seller's page.
+        seller_id (int): The seller's ID.
+
+    Returns:
+        dict: A dictionary containing seller qualification information.
+    """
     kwargs_list = [
         {"class_": "experience"},
         {"class_": "seller-info__subtitle-sales"},
@@ -441,6 +482,16 @@ async def get_seller_async(url, seller_id):

 # ! seller table process
 async def main_seller(seller_ids, seller_links, file_dest):
+    """
+    Processes seller data asynchronously.
+    Because the function is async, the per-seller fetches can run concurrently
+    instead of serially, without blocking the main execution thread.
+
+    Args:
+        seller_ids (list): List of seller IDs.
+        seller_links (list): List of seller links.
+        file_dest (str): Destination file for the processed seller data.
+    """
     # get list of unique sellers
     dict_id_link = dict(zip(seller_ids, seller_links))
 
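Finally, a sketch of the orchestration main_seller describes: build the id-to-link dict to deduplicate sellers, then run every fetch concurrently with asyncio.gather. The fetch coroutine is a hypothetical stand-in, and writing to file_dest is elided.

import asyncio

async def fetch_seller(seller_id, link):
    # Hypothetical stand-in for the real per-seller scrape (get_seller_async).
    await asyncio.sleep(0)  # placeholder for network I/O
    return {"seller_id": seller_id, "link": link}

async def main_seller_sketch(seller_ids, seller_links):
    # get list of unique sellers
    dict_id_link = dict(zip(seller_ids, seller_links))
    tasks = [fetch_seller(sid, link) for sid, link in dict_id_link.items()]
    # run all fetches concurrently on the event loop
    return await asyncio.gather(*tasks)

results = asyncio.run(main_seller_sketch(
    [1, 1, 2],
    ["https://example.com/s1", "https://example.com/s1", "https://example.com/s2"],
))
print(results)  # duplicate seller 1 collapses to a single fetch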
