-
Notifications
You must be signed in to change notification settings - Fork 1
/
main.py
101 lines (78 loc) · 2.57 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
import json
import os
import os.path
from time import sleep
import requests
from dotenv import load_dotenv
from get_num_of_items import get_num_of_items
# Little baby version of Python script that grabs a list of books available
# from O'Reilly Learning via the v2 Search API, then writes the accumulated
# results to oreilly.json. It works but not best practices.

# Load the API key from .env file into the environment.
load_dotenv()

# Request headers. Initialized to an empty dict (not '') so it is falsy
# when there is no key AND always a valid `headers=` value.
header = {}
# If we have an API key, create an Authorization header.
if 'OREILLY_API_KEY' in os.environ:
    OREILLY_API_KEY = os.getenv('OREILLY_API_KEY')
    # Add the API key to the header
    header = {
        'Authorization': 'Token {}'.format(OREILLY_API_KEY),
    }

start_page = 0
current_page = start_page
per_page = 200  # Items per API page. Use 5 for testing.
highlight = 0

# Get the total number of items available for our query from the O'Reilly
# Search API and derive the last page index. (Was hard-coded to 200;
# now consistent with per_page.) Use 1 for testing, don't hammer the API.
end_page = int(get_num_of_items() / per_page)

# Initialize a list we'll store all the item data in.
items = []

# Provide a list of fields you want the API NOT TO return.
exclude_fields = [
    'academic_excluded',
    'archive_id',
    'chapter_title',
    'duration_seconds',
    'has_assessment',
    'source']
# Create a string of query params using the exclude fields above.
# Single '&' separators — the original interpolation produced a stray '&&'.
exclude_field_string = '&'.join(
    f'exclude_fields={field}' for field in exclude_fields)

# Our initial API call.
url = (f'https://learning.oreilly.com/api/v2/search/?query=*&formats=book'
       f'&limit={per_page}&highlight={highlight}&{exclude_field_string}')

# BUG FIX: the original condition was `current_page is not end_page + 1`,
# an *identity* test on ints that is unreliable outside CPython's
# small-int cache (the loop could never terminate by its condition).
# A value comparison gives the same intended iteration count
# (pages 0 through end_page inclusive).
while current_page <= end_page:
    # Make the call, store reply from API in response.
    if header:  # If we have an API key, use it.
        response = requests.get(url, headers=header)
    else:
        response = requests.get(url)
    # Fail loudly on HTTP errors rather than parsing an error body as data.
    response.raise_for_status()
    # Store only the JSON portion of the response
    json_response = response.json()
    # We only need results portion of the JSON; add each book to our list.
    # BUG FIX: this now happens BEFORE the next-page check — the original
    # broke out first and silently dropped the final page of results.
    items.extend(json_response['results'])
    # So we can see progress
    current_page += 1
    print(f'Current Page: {current_page}')
    # Get the next url to pull
    next_url = json_response['next']
    print(next_url)
    if next_url is None:
        print("No more pages to pull.")
        break
    # Update URL for next call
    url = next_url
    # Don't hammer the API
    sleep(1)

# Write the list to a file as JSON.
# BUG FIX: mode 'w' (was 'a+') — appending a second JSON dump to an
# existing file would produce an invalid JSON document on re-runs.
# (os.path.join with a single argument was a no-op and is dropped.)
FILE = "oreilly.json"
with open(
    FILE,
    'w',
    encoding='utf-8',
    errors="replace"
) as outfile:
    json.dump(items, outfile, indent=2)