-
Notifications
You must be signed in to change notification settings - Fork 0
/
pdf_highlight_extractor.py
54 lines (44 loc) · 1.74 KB
/
pdf_highlight_extractor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
#!/usr/bin/env python3
from pathlib import Path

import fitz  # PyMuPDF
# Numeric annotation type code that marks a highlight annotation in PyMuPDF.
HIGHLIGHT_ANNOT_TYPE = 8

# Width of the separator rules used in both the console and the file output.
RULE_WIDTH = 30


def format_highlight(highlight: str) -> str:
    """Return *highlight* with each 'z' + newline turned into a bullet line.

    NOTE(review): presumably 'z' is how the bullet glyph comes out of text
    extraction for the PDFs this script was written for -- confirm. The
    replacement also fires on a real word ending in 'z' at a line end.
    """
    return highlight.replace('z\n', '\n\u2022 ')


def highlights_output_path(pdf_path: str) -> str:
    """Return the path of the highlights text file that sits next to the PDF.

    Only the final path component is rewritten (``<stem>_highlights.txt``),
    so the result can never equal *pdf_path* itself -- even for names such as
    ``book.PDF`` or ``notes`` -- and a ``.pdf`` inside a directory name is
    left alone.
    """
    source = Path(pdf_path)
    return str(source.with_name(source.stem + "_highlights.txt"))


def extract_highlights(doc):
    """Collect the text under every highlight annotation of *doc*.

    Returns a list of ``(page_number, text)`` tuples in page order, with
    1-based page numbers and surrounding whitespace stripped from the text.
    The text is whatever lies inside the annotation's bounding rectangle.
    """
    highlights = []
    for page_number, page in enumerate(doc, start=1):
        for annot in page.annots():
            if annot.type[0] == HIGHLIGHT_ANNOT_TYPE:
                text = page.get_text("text", clip=annot.rect)
                highlights.append((page_number, text.strip()))
    return highlights


def build_report(title, highlights) -> str:
    """Return the complete contents of the highlights text file.

    *title* is interpolated as-is into the header line; *highlights* is a
    sequence of ``(page_number, text)`` tuples.
    """
    parts = [
        "-" * RULE_WIDTH + " ",
        f" 📖 Title: {title} 📖 ",
        "-" * RULE_WIDTH + "\n",
    ]
    for page_number, highlight in highlights:
        parts.append(f"\nPage {page_number}\n")
        parts.append("-" * RULE_WIDTH + "\n")
        parts.append(format_highlight(highlight) + "\n\n")
    return "".join(parts)


def print_report(title, highlights) -> None:
    """Print the title and every highlight, grouped under its page number."""
    print("\n" + "=" * RULE_WIDTH)
    print(f"\n**** 📖 Title: {title} 📖 ****")
    print("\nExtracted Highlights:\n" + "=" * RULE_WIDTH)
    for page_number, highlight in highlights:
        print(f"\n📝 **Page {page_number}**")
        print("-" * RULE_WIDTH)
        print(format_highlight(highlight))
        print("\n" + "=" * RULE_WIDTH)


def main() -> None:
    """Ask for a PDF path, show its highlights and save them to a text file.

    The text file is written next to the PDF (see ``highlights_output_path``).
    Exits with status 1 when the path is not an existing file or the document
    cannot be opened.
    """
    pdf_path = input("Enter the path to the PDF file: ")

    # Validate up front: this also rejects an empty answer and directories.
    if not Path(pdf_path).is_file():
        print("File not found.")
        raise SystemExit(1)

    try:
        doc = fitz.open(pdf_path)
    except (OSError, RuntimeError) as err:
        # NOTE(review): PyMuPDF appears to report unreadable or damaged files
        # through RuntimeError-derived exceptions rather than the builtin
        # FileNotFoundError -- confirm against the installed version.
        print(f"Could not open PDF: {err}")
        raise SystemExit(1)

    # Read everything that needs the open document first; the single handle
    # is then closed even if extraction fails, and nothing below touches it.
    with doc:
        title = doc.metadata['title']
        highlights = extract_highlights(doc)

    print_report(title, highlights)

    output_file = highlights_output_path(pdf_path)
    # Explicit UTF-8: the report contains emoji and bullet characters that a
    # platform-default encoding may not be able to represent.
    with open(output_file, "w", encoding="utf-8") as file:
        file.write(build_report(title, highlights))
    print(f"*** Saved highlights successfully to \"{output_file}\" ***")


if __name__ == "__main__":
    main()