Merge branch 'main' into staging/br_anatel_telefonia
mergify[bot] authored Nov 6, 2023
2 parents 5d72ca6 + 6a842f4 commit 46dfea3
Showing 3 changed files with 54 additions and 2 deletions.
4 changes: 2 additions & 2 deletions pipelines/datasets/br_mercadolivre_ofertas/flows.py
@@ -33,7 +33,7 @@
 )
 
 with Flow(
-    name="br_mercadolivre_ofertas.item", code_owners=["Gabs"]
+    name="br_mercadolivre_ofertas.item", code_owners=["Gabriel Pisa"]
 ) as br_mercadolivre_ofertas_item:
     # Parameters
     dataset_id = Parameter(
@@ -162,7 +162,7 @@
 br_mercadolivre_ofertas_item.schedule = every_day_item
 
 with Flow(
-    name="br_mercadolivre_ofertas.vendedor", code_owners=["Gabs"]
+    name="br_mercadolivre_ofertas.vendedor", code_owners=["Gabriel Pisa"]
 ) as br_mercadolivre_ofertas_vendedor:
     # Parameters
     dataset_id = Parameter(
1 change: 1 addition & 0 deletions pipelines/datasets/br_mercadolivre_ofertas/tasks.py
@@ -29,6 +29,7 @@
 kwargs_list = const_mercadolivre.KWARGS_LIST.value
 url_lists = {"oferta_dia": []}
 
+# loop to fetch all pages
 for i in range(1, 21):
     urls = {"oferta_dia": oferta_dia + str(i)}
     for table, url in urls.items():
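For context, a minimal runnable sketch of what this loop does. The base URL and the append into url_lists are assumptions: oferta_dia actually comes from the pipeline's constants, and the diff truncates the loop body.

# Sketch of the pagination above (assumed base URL; in the pipeline,
# oferta_dia comes from the dataset's constants module).
oferta_dia = "https://www.mercadolivre.com.br/ofertas?page="

url_lists = {"oferta_dia": []}

# loop to fetch all pages (1 through 20)
for i in range(1, 21):
    urls = {"oferta_dia": oferta_dia + str(i)}
    for table, url in urls.items():
        url_lists[table].append(url)  # assumed: the diff cuts off here

print(len(url_lists["oferta_dia"]))  # 20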
51 changes: 51 additions & 0 deletions pipelines/datasets/br_mercadolivre_ofertas/utils.py
@@ -22,6 +22,15 @@
 
 # ! data processing
 def clean_experience(x):
+    """
+    Cleans and extracts numeric experience data from a string.
+
+    Args:
+        x (str): The input string containing experience information.
+
+    Returns:
+        int or None: The extracted numeric experience value, or None if not found.
+    """
     try:
         result = re.findall(r"\d+", x)[0]
     except Exception:
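For illustration, a self-contained version of clean_experience consistent with the new docstring; the int() cast and the None fallback are assumptions, since the diff cuts off before the return.

import re

def clean_experience(x):
    # Sketch per the docstring above; the int() cast and None fallback are assumed.
    try:
        result = re.findall(r"\d+", x)[0]
    except Exception:
        return None
    return int(result)

assert clean_experience("10 Anos vendendo no Mercado Livre") == 10
assert clean_experience("sem dados") is None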
@@ -157,6 +166,15 @@ def get_features(soup):
 # ! used in the items table process
 @retry
 def get_review(soup):
+    """
+    Retrieves review information from a web page using BeautifulSoup.
+
+    Args:
+        soup (BeautifulSoup): The BeautifulSoup object representing the web page.
+
+    Returns:
+        dict: A dictionary containing review information, including stars and review count.
+    """
     script_elements = soup.find_all("script", type="application/ld+json")
 
     json_data = json.loads(script_elements[0].string)
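To make the JSON-LD step concrete, here is a sketch of the parsing pattern get_review relies on. The sample HTML and the stars/review_count keys are assumptions based on the docstring, not MercadoLivre's actual markup.

import json
from bs4 import BeautifulSoup

html = """
<script type="application/ld+json">
{"@type": "Product", "aggregateRating": {"ratingValue": 4.7, "reviewCount": 321}}
</script>
"""

soup = BeautifulSoup(html, "html.parser")
script_elements = soup.find_all("script", type="application/ld+json")
json_data = json.loads(script_elements[0].string)

# schema.org aggregateRating carries the star rating and review count
rating = json_data.get("aggregateRating", {})
review_info = {"stars": rating.get("ratingValue"), "review_count": rating.get("reviewCount")}
print(review_info)  # {'stars': 4.7, 'review_count': 321}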
@@ -184,8 +202,18 @@ def get_review(soup):
 # return review_info
 
 
+# ! used in the items table process
 @retry
 def get_categories(soup):
+    """
+    Retrieves categories from a web page using BeautifulSoup.
+
+    Args:
+        soup (BeautifulSoup): The BeautifulSoup object representing the web page.
+
+    Returns:
+        list: A list of categories extracted from the web page.
+    """
     script_elements = soup.find_all("script", type="application/ld+json")
     categories = []
     # Loop through script elements and extract the desired content
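Similarly, a sketch of the category extraction, assuming the categories arrive as a schema.org BreadcrumbList inside a JSON-LD script tag; the sample HTML is illustrative only.

import json
from bs4 import BeautifulSoup

html = """
<script type="application/ld+json">
{"@type": "BreadcrumbList", "itemListElement": [
  {"@type": "ListItem", "position": 1, "item": {"name": "Eletrônicos"}},
  {"@type": "ListItem", "position": 2, "item": {"name": "Celulares"}}]}
</script>
"""

soup = BeautifulSoup(html, "html.parser")
categories = []
# Loop through script elements and extract the desired content
for script in soup.find_all("script", type="application/ld+json"):
    data = json.loads(script.string)
    if data.get("@type") == "BreadcrumbList":
        categories += [element["item"]["name"] for element in data.get("itemListElement", [])]

print(categories)  # ['Eletrônicos', 'Celulares']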
@@ -218,6 +246,7 @@ def get_seller_link(soup):
     return seller_link
 
 
+# ! used in the items table process
 @retry
 def get_prices(soup, **kwargs):
     """
@@ -417,6 +446,18 @@ def get_features_seller(soup):

 # ! part of the seller table process
 async def get_seller_async(url, seller_id):
+    """
+    Extracts seller qualification information from <span> elements asynchronously.
+    Because the function is async, it can yield control while awaiting network
+    I/O, so many seller pages can be scraped concurrently without blocking.
+
+    Args:
+        url (str): The URL of the seller's page.
+        seller_id (int): The seller's ID.
+
+    Returns:
+        dict: A dictionary containing seller qualification information.
+    """
     kwargs_list = [
         {"class_": "experience"},
         {"class_": "seller-info__subtitle-sales"},
@@ -441,6 +482,16 @@ async def get_seller_async(url, seller_id):

 # ! seller table process
 async def main_seller(seller_ids, seller_links, file_dest):
+    """
+    Processes seller data asynchronously.
+    Because the function is async, the per-seller fetches can run concurrently
+    instead of serially, without blocking the main execution thread.
+
+    Args:
+        seller_ids (list): List of seller IDs.
+        seller_links (list): List of seller links.
+        file_dest (str): Destination file for the processed seller data.
+    """
     # get list of unique sellers
     dict_id_link = dict(zip(seller_ids, seller_links))
 
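Finally, a sketch of the orchestration main_seller describes: build the id-to-link dict to deduplicate sellers, then run every fetch concurrently with asyncio.gather. The fetch coroutine is a hypothetical stand-in, and writing to file_dest is elided.

import asyncio

async def fetch_seller(seller_id, link):
    # Hypothetical stand-in for the real per-seller scrape (get_seller_async).
    await asyncio.sleep(0)  # placeholder for network I/O
    return {"seller_id": seller_id, "link": link}

async def main_seller_sketch(seller_ids, seller_links):
    # get list of unique sellers
    dict_id_link = dict(zip(seller_ids, seller_links))
    tasks = [fetch_seller(sid, link) for sid, link in dict_id_link.items()]
    # run all fetches concurrently on the event loop
    return await asyncio.gather(*tasks)

results = asyncio.run(main_seller_sketch(
    [1, 1, 2],
    ["https://example.com/s1", "https://example.com/s1", "https://example.com/s2"],
))
print(results)  # duplicate seller 1 collapses to a single fetch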
