This repository has been archived by the owner on Jan 29, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 51
/
index_algolia.py
101 lines (78 loc) · 3.67 KB
/
index_algolia.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
from algoliasearch.search_client import SearchClient
import argparse
import glob
from bs4 import BeautifulSoup
import hashlib
# CLI options: Algolia credentials/target index plus the Sphinx HTML output dir.
# None of these are marked required; missing values surface later as None.
parser = argparse.ArgumentParser()
parser.add_argument('--algolia-app-id', help='Algolia Application ID')
parser.add_argument('--algolia-api-key', help='Algolia Admin API Key')
parser.add_argument('--algolia-index-name', help='Algolia Index Name')
parser.add_argument('--html-build-dir', help='Sphinx HTML build directory')
# Path relative to build dir
# Auto-generated Sphinx pages that must never be pushed to the search index.
INDEX_BLACKLIST = ["search.html", "genindex.html"]
def parse_pages(html_build_dir):
    """Walk a Sphinx HTML build directory and build one Algolia record per page.

    Parameters
    ----------
    html_build_dir : str
        Root of the Sphinx HTML output; scanned recursively for ``*.html``.

    Returns
    -------
    list[dict]
        One record per indexable page: title, cleaned body text, canonical
        ``slug`` URL, fixed metadata fields, and a SHA-256 of the relative
        path as the stable ``objectID``.
    """
    pages = []
    for filepath in glob.iglob(html_build_dir + '/**/*.html', recursive=True):
        relative_path = filepath.replace(html_build_dir, "")
        # Strip ".html" / "/index" so the slug matches the served URL.
        full_path = ('https://docs.aiven.io' + relative_path).replace(".html", "").replace("/index", "")
        if relative_path in INDEX_BLACKLIST:
            print(f"Skipping {filepath}")
            continue
        # Sphinx emits UTF-8; be explicit instead of relying on the platform default.
        with open(filepath, encoding="utf-8") as file:
            doc = BeautifulSoup(file.read(), 'html.parser')
        # Initial title from <title>; overridden by the page's <h1> below.
        # Guarded: a malformed page without <title> must not crash the build.
        title = doc.title.text if doc.title else ""
        # Original code did select(...)[0], which raised IndexError on pages
        # without an article container and made the `if elements:` guard dead
        # code; select_one returns None so the guard actually takes effect.
        elements = doc.select_one('div.article-container')
        if elements:
            # Extract title from h1 tag and remove it
            for h1 in elements.select('h1'):
                # Decompose the a tag in the h1 tag (headerlink anchor)
                # before reading the heading text.
                for a in h1.select('a'):
                    a.decompose()
                title = h1.text.strip()
                h1.decompose()
            # Remove non-content markup that would pollute the searchable body:
            # admonitions, shell code blocks, tables of contents, header links,
            # back-to-top links, and icon containers.
            for admonition in elements.select('div.admonition'):
                admonition.decompose()
            for code in elements.select('div.highlight-shell.notranslate'):
                code.decompose()
            for toc in elements.select('div.toctree-wrapper'):
                toc.decompose()
            for headerlink in elements.select('a.headerlink'):
                headerlink.decompose()
            for backtotop in elements.select('a.back-to-top'):
                backtotop.decompose()
            for icons in elements.select('div.content-icon-container'):
                icons.decompose()
            body = elements.text.strip()
            pages.append({
                'title': title,
                'body': body,
                'slug': full_path,
                'facetingType': 'documentation',
                'popularity': 4,
                '_type': 'documentation',
                '__i18n_lang': 'en',
                'isHidden': False,
                # NOTE: key typo ("timestemp") kept deliberately — the live
                # index already stores records under this field name.
                'endDate_timestemp': 4845516771877, # 100 years from now
                'objectID': hashlib.sha256(relative_path.encode("utf-8")).hexdigest() # Use the URL hash as the objectID
            })
        print(f"Parsed {filepath}")
    return pages
def index_pages(client, index_name, pages):
    """Upsert the given page records into the named Algolia index.

    Records sharing an existing objectID overwrite the stored copy; the
    rest are created (IDs auto-generated when absent).
    """
    target = client.init_index(index_name)
    # save_objects performs an upsert keyed on objectID.
    target.save_objects(pages, {'autoGenerateObjectIDIfNotExist': True})
    print(f"Indexed {len(pages)} pages.")
if __name__ == '__main__':
    cli_args = parser.parse_args()
    # https://www.algolia.com/doc/api-client/getting-started/install/python/?client=python
    # Connect and authenticate with your Algolia app
    search_client = SearchClient.create(cli_args.algolia_app_id, cli_args.algolia_api_key)
    # Create a new index and add a record
    records = parse_pages(cli_args.html_build_dir)
    index_pages(search_client, cli_args.algolia_index_name, records)