import argparse
import datetime
import html
import pathlib
import time
import requests
# parse command-line arguments
parser = argparse.ArgumentParser()
parser.add_argument("--user_id",
                    help='User network ID',
                    required=True,
                    type=str)
args = parser.parse_args()
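# example usage: python main.py --user_id <your network user id>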
# need this Stack Exchange API key for a higher request quota per day.
# See also https://api.stackexchange.com/docs/authentication for details
api_key = "YLTVFmHkeJbm7ZIOoXstag(("
# this must appear before every request
base_url = "https://api.stackexchange.com/2.3/"
#%% step 1
"""
Filters are useful for saving bandwidth and getting only the fields that you need in the
response. See https://api.stackexchange.com/docs/filters for details.
There is no need for the "api_site_parameter" name if you have the site URL. See
https://api.stackexchange.com/docs, where it states:
> Each of these methods operates on a single site at a time, identified by the site
> parameter. This parameter can be the full domain name (ie. "stackoverflow.com"), or a
> short form identified by api_site_parameter on the site object.
Need the ".backoff" parameter to detect throttling. See https://api.stackexchange.com/docs/throttle
for details.
"""
r = requests.get(base_url + "filters/create",
                 params={"key":api_key,
                         "include":".items;"
                                   ".has_more;"
                                   ".page;"
                                   ".page_size;"
                                   ".quota_max;"
                                   ".quota_remaining;"
                                   ".backoff;"
                                   "network_user.site_url;"
                                   "network_user.user_id",
                         "base":"none",
                         "unsafe":"false"})
network_users_filter = r.json()['items'][0]['filter']
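# the API returns the filter as a short opaque string; it is passed as the "filter"
# parameter in later requests instead of repeating the whole field list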
"""
According to https://api.stackexchange.com/docs/paging, each response will
have a maximum number of 100 items (the "pagesize" parameter) under the "items"
field. Therefore, if there are more than 100 questions/answers/comments/etc., we will
need to process the first 100 questions, request the next 100 questions, process those,
and so on, until the "has_more" property is set to "False". To request the next 100
questions, we will need to set the "page" property under the ".wrapper" category to "2",
where this property was set to "1" for the first page. See
https://api.stackexchange.com/docs/wrapper for details. Keep this in mind when
making any request.
"""
has_more = True
page_num = 0
site_names = []
user_ids = []
while has_more:
    if has_more:
        page_num += 1
    # note that the "page" parameter must start at 1. See
    # https://api.stackexchange.com/docs/paging. Also, the "types" parameter is used to
    # get both the main sites (e.g. math.stackexchange.com) and their corresponding
    # meta sites (e.g. math.meta.stackexchange.com)
    r = requests.get(base_url + f"users/{args.user_id}/associated",
                     params={"key":api_key,
                             "filter":network_users_filter,
                             "page":str(page_num),
                             "pagesize":"100",
                             "types":"main_site;meta_site"})
    data = r.json()
    # has_more will be set to False if there are no more pages to request, which
    # breaks us out of this loop
    has_more = data['has_more']
    # extract the site names and their corresponding user ids
    for item in data['items']:
        user_ids.append(item['user_id'])
        site_url = item['site_url']
        # skip the first 8 characters in the site url to get the site name. This will be
        # used later to query each site
        site_names.append(site_url[8:])
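        # (e.g. "https://math.stackexchange.com" becomes "math.stackexchange.com")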
print(f"Found {len(site_names)} Stack Exchange sites associated with "
      f"https://stackexchange.com/users/{args.user_id}")
"""
For questions, create a filter using the "/filters/create" method to get the
following fields from the "question" object type
(https://api.stackexchange.com/docs/types/question):
- answers
- body_markdown
- comments
- creation_date
- down_vote_count
- up_vote_count
- score
Need the "shallow_user.display_name" field to return the owner associated with
a question or answer, since the return type is "shallow_user". If the owner is not
returned, then this is a community wiki post.
For some reason, I can't request the "comment.body_markdown" field without also
requesting the "comment.body" field. If "comment.body" is omitted, the
"comment.body_markdown" field does not appear in the response.
"""
r = requests.get(base_url + "filters/create",
                 params={"key":api_key,
                         "include":".items;"
                                   ".has_more;"
                                   ".page;"
                                   ".page_size;"
                                   ".quota_max;"
                                   ".quota_remaining;"
                                   ".backoff;"
                                   "shallow_user.display_name;"
                                   "question.answers;"
                                   "question.title;"
                                   "question.body_markdown;"
                                   "question.comments;"
                                   "question.creation_date;"
                                   "question.down_vote_count;"
                                   "question.up_vote_count;"
                                   "question.score;"
                                   "question.owner;"
                                   "question.link;"
                                   "question.question_id;"
                                   "answer.body_markdown;"
                                   "answer.owner;"
                                   "answer.comments;"
                                   "answer.creation_date;"
                                   "answer.is_accepted;"
                                   "answer.down_vote_count;"
                                   "answer.up_vote_count;"
                                   "answer.score;"
                                   "comment.body;"
                                   "comment.body_markdown;"
                                   "comment.creation_date;"
                                   "comment.owner;"
                                   "comment.score",
                         "base":"none",
                         "unsafe":"false"})
questions_filter = r.json()['items'][0]['filter']
"""
For answers, create a filter using the "/filters/create" method to get the
following fields from the "answer" object type
(https://api.stackexchange.com/docs/types/answer):
- question_id (to go to the question and download it along with its answers)
"""
r = requests.get(base_url + "filters/create",
                 params={"key":api_key,
                         "include":".items;"
                                   ".has_more;"
                                   ".page;"
                                   ".page_size;"
                                   ".quota_max;"
                                   ".quota_remaining;"
                                   ".backoff;"
                                   "answer.question_id",
                         "base":"none",
                         "unsafe":"false"})
answers_filter = r.json()['items'][0]['filter']
#%% step 2
# create the top level directory and do nothing if it already exists
top_level_dir = pathlib.Path("q_and_a")
top_level_dir.mkdir(exist_ok=True)
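# resulting layout: q_and_a/<site_name>/questions/<question_id>.md and
# q_and_a/<site_name>/answers/<question_id>.md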
def write_question(target_dir,question):
    # open the file "<target_dir>/<question id>.md" to write to it. Note that the
    # <question id> can be used to construct the URL for the question as
    # https://<site_name>/q/<question id>
    # Also, we don't use the question title as the file name because the question
    # title can contain characters that are problematic in file names (e.g. LaTeX
    # markup such as "$"). We use the question ID instead.
    # we are assuming that question IDs for each site are unique, so there is no
    # need to deliberately avoid overwriting
    fpath = (target_dir / str(question['question_id'])).with_suffix(".md")
    # if the file already exists, then skip it to save time
    if fpath.exists():
        return
    # see https://stackoverflow.com/a/42495690/13809128 for why "encoding" parameter is
    # needed
    f = fpath.open(mode="w",encoding="utf-8")
    # question metadata
    f.write(f"Question downloaded from {question['link']}\\\n")
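    # the trailing "\\\n" writes a literal backslash before the newline, which Markdown
    # renders as a hard line break after each metadata line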
    creation_datetime = datetime.datetime.fromtimestamp(question['creation_date'],
                                                        tz=datetime.timezone.utc)
    # question may be a community wiki, in which case it has no owner
    if "owner" in question:
        if "display_name" in question["owner"]:
            f.write(f"Question asked by {question['owner']['display_name']} on "
                    f"{creation_datetime.strftime('%Y-%m-%d')} at "
                    f"{creation_datetime.strftime('%H:%M:%S')} UTC.\\\n")
        else:
            f.write(f"Question is community-owned and was asked on "
                    f"{creation_datetime.strftime('%Y-%m-%d')} at "
                    f"{creation_datetime.strftime('%H:%M:%S')} UTC.\\\n")
    else:
        f.write(f"Question is community-owned and was asked on "
                f"{creation_datetime.strftime('%Y-%m-%d')} at "
                f"{creation_datetime.strftime('%H:%M:%S')} UTC.\\\n")
    f.write(f"Number of up votes: {question['up_vote_count']}\\\n")
    f.write(f"Number of down votes: {question['down_vote_count']}\\\n")
    f.write(f"Score: {question['score']}\n")
    # question title
    f.write(f"# {question['title']}\n")
    # question body
    # See https://stackoverflow.com/q/2087370/13809128 for why "html.unescape" is
    # needed.
    f.write(html.unescape(f"{question['body_markdown']}\n"))
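    # (the API HTML-escapes text fields such as "body_markdown" when the filter is
    # created with unsafe=false, so entities like "&amp;" are unescaped before writing)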
    # comments to the question
    # it is safer to use the ".get" method on the dict "question" because it may
    # be the case that the 'comments' field does not exist, such as when there are
    # no comments on the question
    for i,comment in enumerate(question.get('comments',[])):
        f.write(f"### Comment {i+1}\n")
        creation_datetime = datetime.datetime.fromtimestamp(comment['creation_date'],
                                                            tz=datetime.timezone.utc)
        if "owner" in comment:
            if "display_name" in comment["owner"]:
                f.write(f"Comment made by {comment['owner']['display_name']} on "
                        f"{creation_datetime.strftime('%Y-%m-%d')} at "
                        f"{creation_datetime.strftime('%H:%M:%S')} UTC.\\\n")
            else:
                f.write(f"Comment made anonymously on "
                        f"{creation_datetime.strftime('%Y-%m-%d')} at "
                        f"{creation_datetime.strftime('%H:%M:%S')} UTC.\\\n")
        else:
            f.write(f"Comment made anonymously on "
                    f"{creation_datetime.strftime('%Y-%m-%d')} at "
                    f"{creation_datetime.strftime('%H:%M:%S')} UTC.\\\n")
        f.write(f"Comment score: {comment['score']}\n\n")
        f.write(html.unescape(f"{comment['body_markdown']}\n"))
    # answers to the question and the comments on each answer
    for i,answer in enumerate(question.get('answers',[])):
        f.write(f"## Answer {i+1}\n")
        creation_datetime = datetime.datetime.fromtimestamp(answer['creation_date'],
                                                            tz=datetime.timezone.utc)
        if "owner" in answer:
            if "display_name" in answer["owner"]:
                f.write(f"Answer by {answer['owner']['display_name']} on "
                        f"{creation_datetime.strftime('%Y-%m-%d')} at "
                        f"{creation_datetime.strftime('%H:%M:%S')} UTC.\\\n")
            else:
                f.write(f"Anonymous answer that was created on "
                        f"{creation_datetime.strftime('%Y-%m-%d')} at "
                        f"{creation_datetime.strftime('%H:%M:%S')} UTC.\\\n")
        else:
            f.write(f"Anonymous answer that was created on "
                    f"{creation_datetime.strftime('%Y-%m-%d')} at "
                    f"{creation_datetime.strftime('%H:%M:%S')} UTC.\\\n")
        if answer['is_accepted']:
            f.write("This is the accepted answer.\\\n")
        else:
            f.write("This is not the accepted answer.\\\n")
        f.write(f"Number of up votes: {answer['up_vote_count']}\\\n")
        f.write(f"Number of down votes: {answer['down_vote_count']}\\\n")
        f.write(f"Score: {answer['score']}\n\n")
        f.write(html.unescape(f"{answer['body_markdown']}\n"))
        # comments on the answer
        for j,comment in enumerate(answer.get('comments',[])):
            f.write(f"### Comment {j+1}\n")
            creation_datetime = datetime.datetime.fromtimestamp(comment['creation_date'],
                                                                tz=datetime.timezone.utc)
            if "owner" in comment:
                if "display_name" in comment["owner"]:
                    f.write(f"Comment made by {comment['owner']['display_name']} on "
                            f"{creation_datetime.strftime('%Y-%m-%d')} at "
                            f"{creation_datetime.strftime('%H:%M:%S')} UTC.\\\n")
                else:
                    f.write(f"Comment made anonymously on "
                            f"{creation_datetime.strftime('%Y-%m-%d')} at "
                            f"{creation_datetime.strftime('%H:%M:%S')} UTC.\\\n")
            else:
                f.write(f"Comment made anonymously on "
                        f"{creation_datetime.strftime('%Y-%m-%d')} at "
                        f"{creation_datetime.strftime('%H:%M:%S')} UTC.\\\n")
            f.write(f"Comment score: {comment['score']}\n\n")
            f.write(html.unescape(f"{comment['body_markdown']}\n"))
    # close the file after you are done writing
    f.close()
# iterate over the sites
for i,(site_name,user_id) in enumerate(zip(site_names,user_ids)):
    print(f"Downloading and writing questions from site "
          f"{i+1}/{len(site_names)} ({site_name})...",end="",flush=True)
    #%% step 3(a)
    # create the "questions" directory for this site
    questions_dir = top_level_dir / site_name / "questions"
    questions_dir.mkdir(parents=True,exist_ok=True)
    has_more = True
    page_num = 0
    while has_more:
        if has_more:
            page_num += 1
        #%% step 3(b)
        r = requests.get(base_url + f"users/{user_id}/questions",
                         params={"key":api_key,
                                 "site":site_name,
                                 "filter":questions_filter,
                                 "page":str(page_num),
                                 "pagesize":"100"})
        data = r.json()
        has_more = data['has_more']
        questions = data['items']
        #%% step 3(c)
        # if there are no questions associated with this site, then "questions"
        # will be an empty list, such that the following for loop will be skipped.
        for question in questions:
            write_question(questions_dir,question)
        """
        According to https://api.stackexchange.com/docs/throttle:
        > A dynamic throttle is also in place on a per-method level. If an application
        > receives a response with the backoff field set, it must wait that many seconds
        > before hitting the same method again. For the purposes of throttling, all /me
        > routes are considered to be identical to their /users/{ids} equivalent. Note
        > that backoff is set based on a combination of factors, and may not be
        > consistently returned for the same arguments to the same method. Additionally,
        > all methods (even seemingly trivial ones) may return backoff.
        So, we will need to wait a certain amount of time if the "backoff" parameter is
        returned in the response before making another request.
        """
        if "backoff" in data:
            print("We've made too many requests to the Stack Exchange API, so we will "
                  f"need to wait for {data['backoff']} seconds. Please be patient...",flush=True)
            time.sleep(data['backoff'] + 1) # add a second just in case
            print(f"Downloading and writing the remaining questions from {site_name}...",
                  end="",flush=True)
    print("Done.")
    print(f"Downloading and writing answers from site "
          f"{i+1}/{len(site_names)} ({site_name})...",end="",flush=True)
    #%% step 3(d)
    # create the "answers" directory for this site
    answers_dir = top_level_dir / site_name / "answers"
    answers_dir.mkdir(parents=True,exist_ok=True)
    has_more = True
    page_num = 0
    while has_more:
        backoff = False
        if has_more:
            page_num += 1
        #%% step 3(e)
        r = requests.get(base_url + f"users/{user_id}/answers",
                         params={"key":api_key,
                                 "site":site_name,
                                 "filter":answers_filter,
                                 "page":str(page_num),
                                 "pagesize":"100"})
        data = r.json()
        has_more = data['has_more']
        answers = data['items']
        # in case there are no answers associated with this site, skip it
        if len(answers) == 0:
            continue
        if "backoff" in data:
            backoff = True
            backoff_time = data["backoff"]
        #%% step 3(f)
        question_ids = ""
        # use "j" here so we don't shadow the site index "i" from the outer loop
        for j,answer in enumerate(answers):
            question_ids += f"{answer['question_id']}"
            # don't put a semicolon at the end of the query string as this will throw
            # an error
            if j < len(answers) - 1:
                question_ids += ";"
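        # question_ids is now a semicolon-separated id list, e.g. "123;456;789"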
        #%% step 3(g)
        # get all the questions associated with these answers. Since each page holds at
        # most 100 answers, there are at most 100 question ids here, so we don't need to
        # iterate through pages for this request
        r = requests.get(base_url + f"questions/{question_ids}",
                         params={"key":api_key,
                                 "site":site_name,
                                 "filter":questions_filter,
                                 "pagesize":"100"})
        data = r.json()
        questions = data['items']
        if "backoff" in data:
            if backoff: # if "backoff" is already needed from the previous method
                backoff_time = max(data["backoff"],backoff_time)
            else:
                backoff = True
                backoff_time = data["backoff"]
        for question in questions:
            write_question(answers_dir,question)
        if backoff:
            print("We've made too many requests to the Stack Exchange API, so we will "
                  f"need to wait for {backoff_time} seconds. Please be patient...",flush=True)
            time.sleep(backoff_time + 1) # add a second just in case
            print(f"Downloading and writing the remaining answers from {site_name}...",
                  end="",flush=True)
    print("Done.")