-
Notifications
You must be signed in to change notification settings - Fork 0
/
Produtos_Supermercados.py
80 lines (63 loc) · 3.08 KB
/
Produtos_Supermercados.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
import requests
import pandas as pd
from datetime import datetime, timedelta
import io
import urllib3
from urllib3.exceptions import InsecureRequestWarning
# Lista de produtos a pesquisar
SHEET_ID = '1ZGw9064Dj07dKZbKxFUNQVBYS9fDPWhbTLwE0pfjXK4'
SHEET_NAME = 'Folha1'
url = f'https://docs.google.com/spreadsheets/d/{SHEET_ID}/gviz/tq?tqx=out:csv&sheet={SHEET_NAME}'
df = pd.read_csv(url)
keywords = df['Palavras a pesquisar'].to_list()
# Obter dados
# disable SSL certificate verification warnings
requests.packages.urllib3.disable_warnings(category=InsecureRequestWarning)
url = "https://content-api.prices-crawler.duckdns.org/api/v1/products/search"
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/112.0",
"Accept": "application/json, text/plain, */*",
"Accept-Language": "pt-PT,pt;q=0.8,en;q=0.5,en-US;q=0.3",
"Accept-Encoding": "gzip, deflate, br",
"Content-Type": "application/json",
"Origin": "https://prices-crawler.vercel.app",
"Connection": "keep-alive",
"Referer": "https://prices-crawler.vercel.app/",
"Sec-Fetch-Dest": "empty",
"Sec-Fetch-Mode": "cors",
"Sec-Fetch-Site": "cross-site",
}
json_data = []
for keyword in keywords:
data = {
"catalogs": ["pt.auchan", "pt.continente", "pt.minipreco", "pt.pingo-doce"],
"query": keyword,
}
response = requests.post(url, headers=headers, json=data, verify=False)
if response.ok:
response_data = response.json()
# append a dictionary with the keyword value to json_data
json_data.append({"keyword": keyword, "response": response_data})
else:
print(f"API request failed with status code {response.status_code}")
# define the columns for the dataframe
columns = ['keyword', 'locale', 'catalog', 'id', 'reference', 'name', 'regularPrice', 'campaignPrice', 'pricePerQuantity', 'quantity', 'brand', 'description', 'productUrl', 'imageUrl', 'eanUpcList', 'date', 'data']
# create an empty list to hold each row of data
data = []
for dictionary in json_data:
response = dictionary['response']
locale = response[0]['locale']
for catalog_data in response:
catalog = catalog_data['catalog']
for product in catalog_data['products']:
# add the relevant data to the 'data' list
data.append([
dictionary['keyword'], locale, catalog, product.get('id'), product.get('reference'), product.get('name'),
product.get('regularPrice'), product.get('campaignPrice'), product.get('pricePerQuantity'), product.get('quantity'),
product.get('brand'), product.get('description'), product.get('productUrl'), product.get('imageUrl'),
product.get('eanUpcList'), product.get('date'), product.get('data')
])
# create the Pandas dataframe
df = pd.DataFrame(data, columns=columns)
# Get today date now to file name when export to csv or excel with encoding utf8
df.to_csv((datetime.now()+timedelta(hours=1)).strftime('data_sources/data_transformed/produtos_online_scrap-%Y-%m-%d.csv'), encoding='utf8', index=False)