# beach_scrapper.py
import re
import time
import pandas as pd
from typing import List
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.remote.webdriver import WebDriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException


def navigate_to_url(driver: WebDriver, url: str, wait_time: int = 10) -> None:
"""
Navigates the webdriver to a specified URL and waits until the page is loaded.
Parameters:
driver (webdriver): The webdriver instance to use.
url (str): The URL to navigate to.
wait_time (int, optional): The maximum time to wait for the page to load. Defaults to 10.
"""
try:
driver.get(url)
WebDriverWait(driver, wait_time).until(
EC.presence_of_element_located((By.TAG_NAME, 'body')))
except TimeoutException:
raise RuntimeError(
f"Timeout error while navigating to {url}. Page did not load successfully.")
except Exception as e:
raise RuntimeError(
f"An error occurred while navigating to {url}: {str(e)}")
def scroll_to_bottom(driver: WebDriver) -> None:
"""
Scrolls the webdriver to the bottom of the page.
Will keep scrolling down the page until it cannot scroll any further.
Parameters:
driver (webdriver): The webdriver instance to use.
"""
initial_page_source = driver.page_source
can_scroll = True
while can_scroll:
driver.execute_script(
"window.scrollTo(0, document.body.scrollHeight);")
time.sleep(2)
page_source_after_scroll = driver.page_source
if initial_page_source == page_source_after_scroll:
can_scroll = False
else:
            initial_page_source = page_source_after_scroll


def remove_elements(driver: WebDriver, css_selectors: List[str]) -> None:
"""
Removes elements from the webpage that match any of the provided CSS selectors.
Parameters:
driver (webdriver): The webdriver instance to use.
css_selectors (list[str]): A list of CSS selectors. Each element that matches any of these selectors will be removed.
"""
    for css_selector in css_selectors:
        # find_elements returns an empty list when nothing matches, so no exception handling is needed,
        # and every matching element (not just the first) is removed, as the docstring promises
        for element in driver.find_elements(By.CSS_SELECTOR, css_selector):
            driver.execute_script("arguments[0].remove();", element)
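

# Main script: collect beach names and detail-page URLs, then per-beach coordinates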
ROOT_URL = 'https://pochivka.bg/plazhove-bulgaria-f120'
# Overlay elements (quiz box, modal backdrop) that are removed before scraping
CSS_ELEMENTS = ["div.fixed-box.quiz", 'div.backdrop[style*="display: block;"]']
# Beach names appear as the alt text of the map images in the listing page's HTML
PATTERN = r'span class="map">\s*<img alt="([^"]*)"'
REMOVE_STRING = ' (плаж)'  # Bulgarian suffix " (плаж)" (" (beach)"), stripped from each name

# Chrome options so the driver runs headless (in the background)
chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--disable-extensions")
driver = webdriver.Chrome(options=chrome_options)
navigate_to_url(driver, ROOT_URL)
scroll_to_bottom(driver)
remove_elements(driver, CSS_ELEMENTS)
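# Parse the fully loaded page and pull each beach name from the map image alt text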
soup = BeautifulSoup(driver.page_source, "lxml")
beach_list = re.findall(PATTERN, str(soup))
beach_list = [item.replace(REMOVE_STRING, '') for item in beach_list]
beach_dict = {'beach_name': beach_list}
# Needed to locate the beach URLs in the HTML
title_divs = driver.find_elements(By.CSS_SELECTOR, "div.title")
# Gathering the URLs of every beach
url_storage = []
for title_div in title_divs:
try:
inner_anchor = title_div.find_element(By.CSS_SELECTOR, "a")
href_value = inner_anchor.get_attribute("href")
url_storage.append(href_value)
except NoSuchElementException:
break
beach_dict['urls'] = url_storage
latitude_container = []
longitude_container = []
# Extracting the latitude and longitude of every beach
try:
for name_link in beach_dict['urls']:
try:
            # Load each beach page and strip the overlays before reading its coordinates
navigate_to_url(driver, name_link)
scroll_to_bottom(driver)
remove_elements(driver, CSS_ELEMENTS)
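            # Coordinates are exposed via place:location meta tags on each beach page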
latitude_element = driver.find_element(
By.CSS_SELECTOR, 'meta[property="place:location:latitude"]')
longitude_element = driver.find_element(
By.CSS_SELECTOR, 'meta[property="place:location:longitude"]')
latitude_container.append(
latitude_element.get_attribute("content"))
longitude_container.append(
longitude_element.get_attribute("content"))
        except (TimeoutException, NoSuchElementException):
            raise RuntimeError(
                f"Error while processing {name_link}: the page did not load or the coordinate meta tags were not found.")
finally:
driver.quit()
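
# Assemble names, URLs and coordinates into a DataFrame and save it to CSV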
beach_dict.update({'latitude': latitude_container,
'longitude': longitude_container})
beach_info = pd.DataFrame(beach_dict)
beach_info.to_csv('beach_info.csv', index=True)
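
# Usage note (assumption, not part of the original script): the CSV written above
# can be reloaded for later analysis with pd.read_csv('beach_info.csv', index_col=0)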