#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# BASC Macrochan Scraper: 2-get-images.py
#
# This script reads the image IDs stored in `macrochan.db` (populated by
# the previous script in this set), visits each image's view page to find
# the full image URL (with the all-important filename extension), downloads
# the image, and writes the URL, extension, and tags back to the database.
#
# Pages are fetched and parsed with RoboBrowser, which combines requests
# with BeautifulSoup, so HTML tags can be searched directly on the browser
# object.
#
# Dependencies:
# pip install robobrowser
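#
# Sketch of the SQLite schema assumed below (column names inferred from the
# queries in this script; the tables themselves are created by an earlier
# step, so treat this as an assumption, not the authoritative definition):
#   images(imageid, imageext, imageurl)   -- one row per image
#   tags(tagname)                         -- unique tag names
#   taglink(imageid, tagname)             -- many-to-many image/tag links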
import json
import os
import time
import sqlite3
import re
import sys
from urllib.parse import urlparse
from urllib.parse import parse_qs
import requests  # RoboBrowser is built on requests; its exceptions are caught below
from robobrowser import RoboBrowser
# our own libraries
from utils import *
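# Note: `mkdirs` and `download_file` below come from the project's own `utils`
# module; they are assumed here to create a directory tree if it is missing and
# to save a URL to a local file (with `clobber=True` allowing overwrites).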
# Walk the image IDs stored in the database: resolve each image's download URL,
# fetch the file, and record its extension, URL, and tags.
def main():
    # default parameters
    workdir = os.path.join(os.getcwd(), "macrochan-dump")
    mkdirs(workdir)  # ensure that the workdir exists
    site_url = "https://macrochan.org/search.php?&offset=%s"
    view_url = "https://macrochan.org/view.php?u=%s"
    macrochan_url = "https://macrochan.org"
    img_down_url = "https://macrochan.org/images/%s/%s/%s"
    db_fname = os.path.join(workdir, 'macrochan.db')  # filename of database
    delay = 5  # currently set to 5 seconds by default
    offset = 20
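    # Note: site_url, img_down_url, and offset are not referenced in this
    # script (they appear to be carried over from the companion scripts).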
    # create a robot browser
    browser = RoboBrowser()
    # connect to the database that stores the image data
    conn = sqlite3.connect(db_fname)
    c = conn.cursor()
    # obtain all imageids from the database sorted by rowid, stored as a list for later
    c.execute('SELECT imageid FROM images ORDER BY rowid')
    data = c.fetchall()
    # determine the number of rows in the table, and calculate where to stop
    # should be 0 for an empty database
    c.execute('SELECT COUNT(*) FROM images')
    count = c.fetchall()
    row_amt = count[0][0]
    print("Table 'images' has {} rows.".format(row_amt))
    stop = row_amt
    # determine the number of rows that already have imageext, and calculate where to start
    # should be 0 at the beginning
    c.execute('SELECT COUNT(*) FROM images WHERE imageext IS NOT NULL')
    count = c.fetchall()
    row_amt = count[0][0]
    print("Starting at {} on table 'images'.".format(row_amt + 1))
    start = row_amt
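    # Resume logic: rows whose imageext is already filled in were completed on a
    # previous run, so iteration picks up at `start` and works toward `stop`.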
    # read each img_id from the database
    for index in range(start, stop):
        # SQL query results are tuples of tuples; only the first value of each row is needed
        img_id = data[index][0]
        # set URL by img_id
        url = view_url % img_id
        # inform user of progress
        print("Obtaining Image Download URL # {}: {}".format(index + 1, url))
        # open the webpage, check for connection error
        try:
            browser.open(url)
        except requests.exceptions.RequestException:
            print("Unable to connect to Macrochan, restore your internet connection.")
            print("Run this script again to continue where you left off.")
            sys.exit(1)
        # beautifulsoup - find the first <img src=> tag of the main image to obtain the file extension
        img_url = macrochan_url + browser.find('img').get('src', '/')
        img_ext = os.path.splitext(img_url)[1]
        # beautifulsoup - find all <a href=> tags that point at tag searches
        tags = []
        tag_regex = re.compile(r"tags")
        for anchor in browser.find_all('a'):
            this_tag = anchor.get('href', '/')
            if re.search(tag_regex, this_tag):
                # extract tag strings from tag urls
                # e.g. http://macrochan.org/search.php?tags=Motivational+Poster
                tags.append(parse_qs(urlparse(this_tag).query)['tags'][0])
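        # For illustration: with the example URL above, urlparse(this_tag).query is
        # 'tags=Motivational+Poster', and parse_qs() decodes the '+', so the string
        # appended here is 'Motivational Poster'.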
        # download the image
        # save images to folder `macrochan-dump/images/<1st-char>/<2nd-char>/<image-id>.ext`
        img_filename = os.path.join(workdir, "images", img_id[:1], img_id[1:2], img_id + img_ext)
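        # e.g. a (hypothetical) image ID of 'ABCD' with a '.jpg' extension would
        # land at macrochan-dump/images/A/B/ABCD.jpg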
        try:
            download_file(img_filename, img_url, clobber=True)
        except requests.exceptions.RequestException:
            print("Unable to connect to Macrochan, restore your internet connection.")
            print("Run this script again to continue where you left off.")
            sys.exit(1)
        # add img_ext and img_url to the current img_id's row in the database
        values = [img_ext, img_url, img_id]
        c.execute("""UPDATE images SET imageext = ?, imageurl = ? WHERE imageid = ?""", values)
        # insert tag data into the database
        # OR IGNORE used to avoid duplicating tags
        for tag in tags:
            c.execute('INSERT OR IGNORE INTO tags VALUES (?)', [tag])
        # link the current image to its many tags
        # OR IGNORE used to avoid duplicating taglinks
        for tag in tags:
            c.execute('INSERT OR IGNORE INTO taglink (imageid, tagname) VALUES (?,?)', [img_id, tag])
        # display all linking-table data for the current image entry
        for row in c.execute('SELECT imageid, tagname FROM taglink WHERE imageid = ? ORDER BY tagname', [img_id]):
            print(row)
        # Save (commit) the database changes
        conn.commit()
        # delay before the next iteration
        print("Waiting for %d seconds..." % delay)
        time.sleep(delay)
    # close the sqlite database once finished
    conn.close()
if __name__ == '__main__':
    # retry loop: in case of failure or timeout, wait 60 sec. and try again
    # by default, try 5 times
    for i in range(1, 6):
        print("Attempt {}:".format(i))
        main()
        print("Restarting script in {} seconds...".format(60))
        time.sleep(60)
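
# Usage sketch (assuming the previous script has already created and populated
# macrochan-dump/macrochan.db with the image IDs):
#   python3 2-get-images.py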