scrape_v1.4.py

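"""Bulk-download images from Unsplash search results.

Prompts for a search term, how many images to fetch, a per-page batch size,
and a destination folder, then pages through the search API and saves each
photo as <search_term>_<photo_id>.jpg with a tqdm progress bar per file.

Note: https://unsplash.com/napi/... appears to be the internal, undocumented
API behind the Unsplash website, so the endpoint may change without notice.
"""
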
import requests
import time
import random
import sys
import os
from tqdm import tqdm
logo= """
┏┓┏┓┳┓┏┓•┏┓┏┓┳┓ ┓ ┏┓
┗┓┃ ┣┫┣┫┓┃┃┣ ┣┫ ┓┏┃ ┃┃
┗┛┗┛┛┗┛┗┗┣┛┗┛┛┗ ┗┛┻•┗╋
By Kallamamran
"""
print(logo)
#----------------------------------------------------------------------------------
# Function to download images with a progress bar
def download_image(image_url, filename, current, total):
    try:
        # Send a GET request to the image URL (the Unsplash download link
        # redirects to the actual file, hence allow_redirects=True)
        response = requests.get(image_url, stream=True, allow_redirects=True)
        # Check if the request was successful
        if response.status_code == 200:
            # Get the total file size from the headers (0 if not reported)
            file_size = int(response.headers.get('content-length', 0))
            # Open the file in binary write mode and wrap the download in a progress bar
            with open(filename, 'wb') as file, tqdm(
                desc=f"Downloading image {current}/{total}",
                total=file_size,
                unit='B',
                unit_scale=True,
                unit_divisor=1024,
            ) as bar:
                # Download the file in chunks
                for chunk in response.iter_content(chunk_size=1024):
                    if chunk:  # filter out keep-alive new chunks
                        file.write(chunk)
                        # Update the progress bar
                        bar.update(len(chunk))
            return True
        # Treat any non-200 response as a failed download
        print(f"Failed to download image {filename}. Status code: {response.status_code}")
        return False
    except requests.RequestException as e:
        print(f"Failed to download image {filename}. Error: {e}")
        return False
#----------------------------------------------------------------------------------
search_term = input('Enter the search term for the images: ')
desired_image_count = int(input('Enter the number of images you want to download: '))
# Get input from user and use a default value if input is blank
img_per_page_input = input('Enter the amount of images you want the scraper to load per page (leave blank to use 20): ')
img_per_page = int(img_per_page_input) if img_per_page_input.isdigit() else 20
download_location = input('Enter foldername you want to create for your downloads (leave blank to use your search term): ')
# If download location is left blank, use the search term as the folder name
if not download_location:
    download_location = search_term
# Ensure the download location ends with a slash and exists
download_location = download_location.rstrip('/') + '/'
if not os.path.exists(download_location):
    os.makedirs(download_location)
downloaded_images = 0
current_page = 1
errors = 0
max_errors = 10
try:
    while downloaded_images < desired_image_count:
        # Fetch one page of search results, honoring the user's per-page setting
        api_url = f'https://unsplash.com/napi/search/photos?query={search_term}&page={current_page}&per_page={img_per_page}'
        response = requests.get(api_url)
        if response.status_code == 200:
            images_data = response.json()
            # If no more images are found, break the loop
            if not images_data['results']:
                print('No more images to download.')
                break
            for photo in images_data['results']:
                if downloaded_images >= desired_image_count:
                    break
                photo_id = photo['id']
                filename = os.path.join(download_location, f'{search_term}_{photo_id}.jpg')
                # Only attempt to download if the file doesn't already exist
                if not os.path.exists(filename):
                    download_url = photo['links']['download']
                    if download_image(download_url, filename, downloaded_images + 1, desired_image_count):
                        downloaded_images += 1
                        errors = 0  # Reset the error count after a successful download
                    else:
                        errors += 1
                        if errors >= max_errors:
                            print("Too many consecutive errors, stopping the script.")
                            break  # Breaking out of the inner loop
                else:
                    print(f"Image already exists: {filename}")
                    continue  # Skip to the next photo
            if errors >= max_errors:
                break  # Breaking out of the outer loop if max errors reached
            current_page += 1
            # Random delay between 1.0 and 2.0 seconds to be gentle on the server
            time.sleep(random.uniform(1.0, 2.0))
        else:
            print(f"Failed to retrieve page {current_page}: Status code {response.status_code}")
            errors += 1
            if errors >= max_errors:
                print("Too many consecutive errors, stopping the script.")
                break  # Breaking out of the outer loop
            # Random delay before retrying the same page
            time.sleep(random.uniform(1.0, 2.0))
except KeyboardInterrupt:
    print('\nScript interrupted by user. Exiting gracefully.')
    sys.exit(0)
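
# Example run (illustrative values; real photo IDs will differ):
#   Enter the search term for the images: mountains
#   Enter the number of images you want to download: 5
#   Enter the amount of images you want the scraper to load per page (leave blank to use 20):
#   Enter foldername you want to create for your downloads (leave blank to use your search term):
# Each image is then saved as mountains/mountains_<photo_id>.jpg.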