-
Notifications
You must be signed in to change notification settings - Fork 1
/
3-download-images.py
80 lines (67 loc) · 2.45 KB
/
3-download-images.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# BASC Macrochan Scraper: 3-download-images.py
#
# This script reads the image URLs (with the all
# important filename extension) from the file
# `img_urls.txt` and downloads each image with
# `requests`.
#
# Because Macrochan's HTML pages utilize JavaScript to
# initiate queries, the Ghost.py headless browser is
# required to generate HTML views. It is also useful for
# gathering data with JavaScript parsing.
#
# Dependencies:
# pip install bs4
# pip install Ghost.py
# pip install requests
import json
import os
import time

import requests
from ghost import Ghost
# save bandwidth by not loading images through ghost
# NOTE(review): `ghost` is not referenced anywhere else in the visible part
# of this file; presumably carried over from the previous scraper step --
# confirm before removing.
ghost = Ghost(download_images=False)
def mkdirs(path):
    """Make directory (and any missing parents), if it doesn't exist.

    Args:
        path: Directory to create. An empty string is treated as the
            current directory and is a no-op.

    Raises:
        FileExistsError: If ``path`` exists but is not a directory.
    """
    # os.path.dirname('file.txt') yields '', and os.makedirs('') raises
    # FileNotFoundError, so treat "no directory part" as nothing to do.
    if not path:
        return
    # exist_ok avoids the check-then-create race of testing os.path.exists
    # first (another process may create the directory in between).
    os.makedirs(path, exist_ok=True)
def download_file(local_filename, url, clobber=False):
    """Download the given file. Clobber overwrites file if exists.

    Args:
        local_filename: Destination path on disk; missing parent
            directories are created.
        url: Address fetched with ``requests.get``.
        clobber: When true, download again and overwrite an existing
            file. When false, an existing file is left untouched and no
            request is made.

    Returns:
        True if the file is present afterwards (freshly downloaded or
        already on disk), False if the server answered with an error
        status (4xx/5xx).
    """
    dir_name = os.path.dirname(local_filename)
    # A bare filename has no directory part; there is nothing to create.
    if dir_name:
        mkdirs(dir_name)

    if not clobber and os.path.exists(local_filename):
        return True

    # stream=True defers fetching the body, so the chunked loop below really
    # bounds memory use instead of iterating over an already-loaded payload.
    response = requests.get(url, stream=True)
    try:
        # Any error status (not only 404) would otherwise be saved to disk
        # as if it were the image.
        if not response.ok:
            print('Failed to download file:', local_filename, url)
            return False

        # Write to a temporary name first so an interrupted transfer never
        # leaves a truncated file that later runs would skip as "done".
        partial_filename = local_filename + '.part'
        chunk_size_in_bytes = 1024 * 1024  # 1MB
        with open(partial_filename, 'wb') as local_file:
            for chunk in response.iter_content(chunk_size=chunk_size_in_bytes):
                local_file.write(chunk)
    finally:
        # Release the connection back to the pool even on failure.
        response.close()

    os.replace(partial_filename, local_filename)
    return True
def local_image_path(workdir, img_url):
    """Return the local path under ``workdir`` for the image at ``img_url``.

    Images are dumped to ``<1st-char>/<2nd-char>/<filename>``, where the two
    directory names are the first two characters of the image filename.

    Args:
        workdir: Root directory of the dump.
        img_url: Image URL; surrounding whitespace (such as the trailing
            newline of a line read from a file) is ignored.

    Returns:
        The destination path, or None when ``img_url`` has no filename
        component (blank line, or a URL ending in ``/``).
    """
    filename = img_url.strip().rsplit('/', 1)[-1]
    if not filename:
        return None
    return os.path.join(workdir, filename[:1], filename[1:2], filename)


# Download every image listed in img_urls
if __name__ == '__main__':
    # default parameters
    workdir = os.path.join(os.getcwd(), "macrochan-dump-" + time.strftime('%Y-%m-%d'))  # labeled with today's date
    mkdirs(workdir)  # ensure that the workdir exists
    img_url_fname = 'img_urls.txt'  # filename of img_urls
    delay = 5  # currently set to 5 seconds by default

    # read each img_url from the img_urls text file; the file is only read
    # here -- opening it for writing first would erase the list to download
    with open(img_url_fname, 'r') as f:
        for line in f:
            img_url = line.strip()  # drop the trailing newline
            local_filename = local_image_path(workdir, img_url)
            if local_filename is None:
                # blank line or URL without a filename: nothing to fetch
                continue

            # 3a. Use requests to download images (dump to `<1st-char>/<2nd-char>/`)
            download_file(local_filename, img_url)

            # delay before next iteration
            print("Waiting for %d seconds..." % delay)
            time.sleep(delay)