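"""fetch.py

Crawl every URL listed in the devinit.org sitemap and save a cleaned-up static
copy of each page under the local "docs" directory.
"""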
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse
from lxml import etree
from dotenv import load_dotenv
from tqdm import tqdm
import re
import csv

load_dotenv()

HEADER = os.getenv('HEADER')
HASH = os.getenv('HASH')
HEADERS = {
    HEADER: HASH,
}
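# NOTE: HEADER and HASH are read from the .env file and form a single custom
# request header (name and value, respectively). Example values such as
# HEADER=X-Example-Token and HASH=<secret> are illustrative only.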
# URL of the sitemap
sitemap_url = "https://devinit.org/sitemap.xml"
# Output directory for saving the static site
output_dir = "docs"
def remove_cache_busting_hashes(text):
    # Pattern to match URLs starting with "/assets", capture the file base name and extension, and remove the hash
    pattern = r'(/assets/.*?)(\.[a-f0-9]{12})(\.css|\.csv|\.eot|\.gif|\.hbs|\.html|\.ico|\.jpg|\.js|\.json|\.md|\.mjs|\.png|\.svg|\.ttf|\.txt|\.webp|\.woff|\.woff2)'
    # Replace the matched pattern with the cleaned URL (without the hash)
    cleaned_text = re.sub(pattern, r'\1\3', text)
    return cleaned_text
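# Example (illustrative): remove_cache_busting_hashes('/assets/css/site.0123456789ab.css')
# returns '/assets/css/site.css'.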
documents_mapping = list()
with open('docs.csv', newline='') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        doc_id = row['id']
        doc_file = row['file']
        doc_file_basename = os.path.basename(doc_file)
        doc_path_from = f"/documents/{doc_id}/{doc_file_basename}"
        doc_path_to = f"/media/{doc_file}"
        doc_dict = {'from': doc_path_from, 'to': doc_path_to}
        documents_mapping.append(doc_dict)
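# docs.csv is expected to provide 'id' and 'file' columns; an illustrative row
# such as id=123, file=reports/example.pdf yields the mapping
# /documents/123/example.pdf -> /media/reports/example.pdf.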
def replace_document_redirects(text):
    for doc_map in documents_mapping:
        text = text.replace(doc_map['from'], doc_map['to'])
    return text


# Fetch and parse the sitemap
def fetch_sitemap(url):
    response = requests.get(url, headers=HEADERS)
    if response.status_code == 200:
        sitemap_xml = response.content
        sitemap = etree.fromstring(sitemap_xml)
        urls = sitemap.findall(".//{http://www.sitemaps.org/schemas/sitemap/0.9}loc")
        return [url.text for url in urls]
    else:
        print(f"Error fetching sitemap: {response.status_code}")
        return []
# Fetch HTML content from a URL
def fetch_page(url):
    response = requests.get(url, headers=HEADERS)
    if response.status_code == 200:
        return response.text
    else:
        print(f"Error fetching page {url}: {response.status_code}")
        return None


def make_url_path(url):
    parsed_url = urlparse(url)
    path = parsed_url.path
    if path.endswith('/'):
        path = path + "index.html"
    if not path.endswith('.html'):
        path = path + ".html"
    # Full output path
    file_path = os.path.join(output_dir, path.strip('/'))
    return file_path
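# Example (illustrative, POSIX paths): make_url_path('https://devinit.org/about/')
# returns 'docs/about/index.html', while make_url_path('https://devinit.org/blog/post')
# returns 'docs/blog/post.html'.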
# Save HTML content to a file
def save_html(url, html_content):
    file_path = make_url_path(url)
    # Create directories if necessary
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    # Save the HTML file
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(html_content)
# Crawl all URLs in the sitemap and save them
def crawl_and_save(sitemap_url):
    urls = fetch_sitemap(sitemap_url)
    for url in tqdm(urls):
        if url.startswith('https://devinit.org/data/spotlight-kenya'):
            continue
        if url.startswith('https://devinit.org/data/spotlight-uganda'):
            continue
        file_path = make_url_path(url)
        if not os.path.exists(file_path):
            html_content = fetch_page(url)
            if html_content:
                soup = BeautifulSoup(html_content, "html.parser")
                # Find the div with class 'notice__wrapper' and remove it
                notice_div = soup.find('div', class_='notice__wrapper')
                if notice_div:
                    notice_div.decompose()  # Removes the element from the DOM
                # Convert the BeautifulSoup object to a string for easier manipulation
                html_str = soup.prettify()
                # Replace "https://devinit-prod-static.ams3.cdn.digitaloceanspaces.com/" with "/"
                html_str = html_str.replace("https://devinit-prod-static.ams3.cdn.digitaloceanspaces.com/", "/")
                # Replace "https://devinit.org/" with "/"
                html_str = html_str.replace("https://devinit.org/", "/")
                # Remove the caching hash from assets URLs
                html_str = remove_cache_busting_hashes(html_str)
                # Replace document redirect URLs
                html_str = replace_document_redirects(html_str)
                # Save the modified page
                save_html(url, html_str)
if __name__ == "__main__":
    crawl_and_save(sitemap_url)