fetch_pages.py

import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse
from lxml import etree
from dotenv import load_dotenv
from tqdm import tqdm
import re
import csv

load_dotenv()

HEADER = os.getenv('HEADER')
HASH = os.getenv('HASH')

HEADERS = {
    HEADER: HASH,
}
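
# HEADER and HASH are read from a local .env file (via load_dotenv) and form a
# single request header sent with every fetch. The actual header is
# deployment-specific; a hypothetical .env might look like:
#   HEADER=X-Example-Access-Key
#   HASH=some-secret-value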

# Output directory for saving the static site
output_dir = "docs"


def remove_cache_busting_hashes(text):
    # Pattern to match URLs starting with "/assets", capture the file base name
    # and extension, and remove the hash
    pattern = r'(/assets/.*?)(\.[a-f0-9]{12})(\.css|\.csv|\.eot|\.gif|\.hbs|\.html|\.ico|\.jpg|\.js|\.json|\.md|\.mjs|\.png|\.svg|\.ttf|\.txt|\.webp|\.woff|\.woff2)'
    # Replace the matched pattern with the cleaned URL (without the hash)
    cleaned_text = re.sub(pattern, r'\1\3', text)
    return cleaned_text
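
# For illustration only (the asset path below is hypothetical):
#   remove_cache_busting_hashes('<link href="/assets/css/main.3f2a9c1b4d5e.css">')
#   returns '<link href="/assets/css/main.css">'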


# Fetch HTML content from a URL
def fetch_page(url):
    response = requests.get(url, headers=HEADERS)
    if response.status_code == 200:
        return response.text
    else:
        print(f"Error fetching page {url}: {response.status_code}")
        return None
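
# Example call (the URL is one of the listing pages crawled below):
#   fetch_page("https://devinit.org/blog/?page=1")
# returns the page HTML as a string, or None on any non-200 response.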


def make_url_path(url):
    parsed_url = urlparse(url)
    path = parsed_url.path
    query = parsed_url.query.replace("=", "_")
    path = f"{path}{query}/"
    if path.endswith('/'):
        path = path + "index.html"
    if not path.endswith('.html'):
        path = path + ".html"
    # Full output path
    file_path = os.path.join(output_dir, path.strip('/'))
    return file_path
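
# For example, make_url_path("https://devinit.org/resources/?page=2")
# yields "docs/resources/page_2/index.html", so each query-string page gets
# its own directory under the output tree.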


# Save HTML content to a file
def save_html(url, html_content):
    file_path = make_url_path(url)
    # Create directories if necessary
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    # Save the HTML file
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(html_content)
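
# The directory tree under docs/ mirrors the crawled URL structure, so the
# saved pages can be served directly by a static file host.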


# Crawl all page URLs and save them
def crawl_and_save():
    resource_urls = ["https://devinit.org/resources/?page=" + str(i) for i in range(1, 49)]
    blog_urls = ["https://devinit.org/blog/?page=" + str(i) for i in range(1, 34)]
    urls = resource_urls + blog_urls
    for url in tqdm(urls):
        file_path = make_url_path(url)
        if not os.path.exists(file_path):
            html_content = fetch_page(url)
            if html_content:
                soup = BeautifulSoup(html_content, "html.parser")
                # Find the div with class 'notice__wrapper' and remove it
                notice_div = soup.find('div', class_='notice__wrapper')
                if notice_div:
                    notice_div.decompose()  # Removes the element from the DOM
                # JavaScript to inject into each saved page
                js_code = """
                <script>
                    // Get the URLSearchParams object for the current URL
                    const urlParams = new URLSearchParams(window.location.search);
                    const pageParam = urlParams.get('page'); // Get the 'page' parameter
                    if (pageParam) {
                        // Get the current URL path and origin (e.g., https://devinit.org/resources/)
                        const currentUrl = window.location.origin + window.location.pathname;
                        // Replace any existing page_X segment in the path with the requested page
                        const newPath = currentUrl.replace(/\\/page_\\d+\\//, `/page_${pageParam}/`);
                        // Redirect if the new path differs from the current URL
                        if (newPath + window.location.search !== window.location.href) {
                            window.location.href = newPath;
                        }
                    }
                </script>
                """
                # Create a new Tag object for the script tag
                script_tag = BeautifulSoup(js_code, "html.parser")
                # Append the script tag to the end of the body
                soup.body.append(script_tag)
                # Convert the BeautifulSoup object to a string for easier manipulation
                html_str = soup.prettify()
                # Replace "https://devinit-prod-static.ams3.cdn.digitaloceanspaces.com/" with "/"
                html_str = html_str.replace("https://devinit-prod-static.ams3.cdn.digitaloceanspaces.com/", "/")
                # Replace "https://devinit.org/" with "/"
                html_str = html_str.replace("https://devinit.org/", "/")
                # Remove the caching hash from assets URLs
                html_str = remove_cache_busting_hashes(html_str)
                # Save the modified page
                save_html(url, html_str)
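
# The injected script is intended to map the original ?page=N query URLs onto
# the page_N/ directories that make_url_path produces, once the pages are
# served statically. Pages that already exist on disk are skipped, so the
# crawl can be interrupted and resumed.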


if __name__ == "__main__":
    crawl_and_save()