-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrename_pdfs.py
234 lines (194 loc) · 7.97 KB
/
rename_pdfs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
#!/usr/bin/env python3
"""
PDF Renamer Script
This script renames PDF files based on their content, extracting the author and title
information using AI-powered text analysis. It handles encrypted PDFs and provides
a colorful, informative CLI output with a progress bar and results table.
Usage:
python rename_pdfs.py <pdf_folder>
Requirements:
- Python 3.10+ (the type annotations use the `X | None` union syntax)
- Libraries: pdfplumber, PyPDF2, ollama, rich
Date: August 26, 2024
Version: 1.1
"""
import os
import sys
import pdfplumber
import ollama
import json
import re
from rich.console import Console
from rich.table import Table
from rich.progress import Progress, SpinnerColumn, BarColumn, TextColumn
from rich.panel import Panel
from PyPDF2 import PdfReader
console = Console()
def is_pdf_encrypted(pdf_path: str) -> bool:
    """
    Check if a PDF file is encrypted.

    The file is first opened with pdfplumber; if that succeeds the PDF is
    reported as not encrypted. If it fails, PyPDF2's ``is_encrypted`` flag is
    used instead, and a file that cannot be read that way either is reported
    as encrypted.

    Args:
        pdf_path (str): Path to the PDF file.

    Returns:
        bool: True if the PDF is encrypted (or cannot be read at all),
        False otherwise.
    """
    try:
        # Successfully opening the file is the whole probe; the handle itself
        # is not needed, so it is not bound to a name.
        with pdfplumber.open(pdf_path):
            return False
    except Exception:
        # `except Exception` instead of a bare `except:` so that Ctrl-C
        # (KeyboardInterrupt) and SystemExit are not swallowed and
        # misreported as "encrypted".
        try:
            with open(pdf_path, 'rb') as file:
                return PdfReader(file).is_encrypted
        except Exception:
            # Unreadable by both libraries: treat it like an encrypted file.
            return True
def extract_text_from_pdf(pdf_path: str) -> str | None:
    """
    Pull the text of the first two pages out of a PDF.

    pdfplumber is tried first; if it raises, the failure is reported on the
    console and PyPDF2 is tried instead.

    Args:
        pdf_path (str): Path to the PDF file.

    Returns:
        str | None: None when is_pdf_encrypted() reports the file as encrypted.
        Otherwise the stripped text of the first two pages, or one of two
        placeholder sentences: "No text could be extracted from the PDF." when
        the pages yield no text, "Error extracting text from PDF." when both
        libraries fail. The placeholders are ordinary strings, so callers
        cannot tell them apart from real content by type.
    """
    if is_pdf_encrypted(pdf_path):
        return None

    try:
        with pdfplumber.open(pdf_path) as document:
            page_texts = (page.extract_text() for page in document.pages[:2])
            # Pages without text come back falsy and are skipped.
            combined = "\n".join(chunk for chunk in page_texts if chunk)
        return combined.strip() or "No text could be extracted from the PDF."
    except Exception as plumber_error:
        console.print(f"[bold yellow]pdfplumber failed: {plumber_error}. Trying PyPDF2...[/bold yellow]")
        try:
            with open(pdf_path, 'rb') as handle:
                fallback_reader = PdfReader(handle)
                page_texts = (page.extract_text() for page in fallback_reader.pages[:2])
                combined = "\n".join(chunk for chunk in page_texts if chunk)
            return combined.strip() or "No text could be extracted from the PDF."
        except Exception as pypdf_error:
            console.print(f"[bold red]Error extracting text from {pdf_path}: {pypdf_error}[/bold red]")
            return "Error extracting text from PDF."
def refine_with_llama3(text: str) -> tuple[str | None, str | None]:
"""
Use Llama3.1 AI model to extract title and author from text.
Args:
text (str): The text to analyze.
Returns:
tuple[str | None, str | None]: A tuple containing the title and author,
or (None, None) if extraction failed.
"""
if not text:
return None, None
prompt = f"Given the following text, extract the title and author. If you can't determine either, return null. Respond in JSON format with 'title' and 'author' keys:\n\n{text[:500]}"
try:
response = ollama.chat(model="llama3.1", messages=[{"role": "user", "content": prompt}])
json_match = re.search(r'\{.*\}', response['message']['content'], re.DOTALL)
if json_match:
json_str = json_match.group(0)
# Clean up the JSON string
json_str = re.sub(r'(?<!\\)"([^"]*?)"(?=\s*:)', r'"\1"', json_str)
json_str = json_str.replace("'", '"')
json_str = re.sub(r'(\w+):', r'"\1":', json_str) # Ensure all keys are quoted
json_str = re.sub(r',\s*}', '}', json_str) # Remove trailing commas
result = json.loads(json_str)
title = result.get('title')
author = result.get('author')
# Ensure title and author are strings or None
title = str(title) if title and title != "null" else None
author = str(author) if author and author != "null" else None
return title_case(title) if title else None, title_case(author) if author else None
return None, None
except Exception as e:
console.print(f"[bold red]Error in Ollama call: {e}[/bold red]")
return None, None
def title_case(s: str) -> str:
    """
    Convert a string to title case, handling exceptions for common words.

    The string is lower-cased, runs of whitespace are collapsed to a single
    space, and every word is capitalized except short function words
    ("a", "of", "the", ...), which stay lower-case unless they are the first
    word. Punctuation and other non-word characters (hyphens, colons,
    parentheses, ...) are kept exactly where they were.

    Args:
        s (str): The string to convert.

    Returns:
        str: The string in title case. Falsy input (empty string or None) is
        returned unchanged.
    """
    if not s:
        return s
    exceptions = {'a', 'an', 'and', 'as', 'at', 'but', 'by', 'for', 'if', 'in', 'of', 'on', 'or', 'the', 'to', 'with'}
    normalized = " ".join(s.lower().split())
    # Splitting on a capturing group keeps the separators: words land on the
    # odd indices, whatever sits between them on the even ones. Rebuilding from
    # these pieces avoids the "Hello , World" spacing and the silent loss of
    # characters that a findall-then-join approach causes.
    pieces = re.split(r"([\w']+)", normalized)
    for idx in range(1, len(pieces), 2):
        if idx == 1 or pieces[idx] not in exceptions:
            pieces[idx] = pieces[idx].capitalize()
    return "".join(pieces)
def clean_filename(text: str) -> str:
    """
    Reduce a string to characters that are safe in a filename.

    Args:
        text (str): The text to clean.

    Returns:
        str: "Unknown" when *text* is empty or None. Otherwise *text* with
        everything except alphanumerics, spaces, hyphens and underscores
        removed, cut to 100 characters and stripped of surrounding whitespace
        (which may leave an empty string).
    """
    if not text:
        return "Unknown"
    permitted_symbols = ' -_'
    kept = [ch for ch in text if ch.isalnum() or ch in permitted_symbols]
    # Truncate before stripping, so the result can be shorter than 100 chars.
    return ''.join(kept)[:100].strip()
def rename_pdf(pdf_path: str) -> tuple[str, str | None, str | None, str]:
    """
    Rename a PDF file based on its content.

    The new name has the form "<author> - <title>.pdf" and stays in the same
    directory. The file is left untouched when it is encrypted, when neither
    an author nor a title could be extracted, or when it already carries the
    name it would be given. If another file already uses the target name, a
    numeric suffix (_1, _2, ...) is appended.

    Args:
        pdf_path (str): Path to the PDF file.

    Returns:
        tuple[str, str | None, str | None, str]: A tuple containing the original filename,
        extracted author, extracted title, and new filename (equal to the
        original filename when nothing was renamed). For encrypted files both
        author and title are the string "Encrypted".

    Raises:
        OSError: If the rename itself fails.
    """
    original_name = os.path.basename(pdf_path)

    text = extract_text_from_pdf(pdf_path)
    if text is None:
        return original_name, "Encrypted", "Encrypted", original_name

    title, author = refine_with_llama3(text)

    # Clean only values that were really extracted. clean_filename() turns a
    # missing value into the placeholder "Unknown", which made the old
    # "did we extract anything?" check pass even when nothing was found and
    # produced files called "Unknown - Unknown.pdf".
    clean_title = clean_filename(title) if title else ""
    clean_author = clean_filename(author) if author else ""
    if not clean_title and not clean_author:
        return original_name, author, title, original_name

    new_name = f"{clean_author or 'Unknown Author'} - {clean_title or 'Unknown Title'}.pdf"
    base, ext = os.path.splitext(os.path.join(os.path.dirname(pdf_path), new_name))

    # Walk "<base>.pdf", "<base>_1.pdf", "<base>_2.pdf", ... until a free name
    # is found. If one of the taken names is the file itself, it is already
    # named correctly; stopping there keeps repeated runs from bumping the
    # suffix every time.
    candidate = f"{base}{ext}"
    counter = 0
    while os.path.exists(candidate):
        if os.path.samefile(candidate, pdf_path):
            return original_name, author, title, original_name
        counter += 1
        candidate = f"{base}_{counter}{ext}"

    os.rename(pdf_path, candidate)
    return original_name, author, title, os.path.basename(candidate)
def main(pdf_folder: str):
    """
    Main function to process all PDFs in a folder.

    Every regular file in *pdf_folder* whose name ends in ".pdf" (in any
    letter case) is passed to rename_pdf(), in sorted order, while a progress
    bar is shown. Afterwards a table with the original filename, author,
    title and new filename of each file is printed. Sub-folders are not
    searched.

    Args:
        pdf_folder (str): Path to the folder containing PDF files.

    Exits the process with status 1 if *pdf_folder* is not a directory.
    """
    if not os.path.isdir(pdf_folder):
        console.print(f"[bold red]Error: {pdf_folder} is not a valid directory.[/bold red]")
        sys.exit(1)

    # Case-insensitive suffix match so "SCAN.PDF" is not skipped; sorted so
    # the processing order and the result table are reproducible.
    pdf_files = sorted(
        f for f in os.listdir(pdf_folder)
        if f.lower().endswith('.pdf') and os.path.isfile(os.path.join(pdf_folder, f))
    )
    results = []

    with Progress(SpinnerColumn(), "[progress.percentage]{task.percentage:>3.0f}%",
                  BarColumn(), "{task.completed}/{task.total}",
                  TextColumn("[bold blue]{task.description}"),
                  console=console, transient=True) as progress:
        task = progress.add_task("[cyan]Processing PDFs...", total=len(pdf_files))
        for filename in pdf_files:
            # Set the description before the (slow) work starts, so the bar
            # names the file being processed rather than the previous one.
            progress.update(task, description=f"[cyan]Processing: {filename}")
            pdf_path = os.path.join(pdf_folder, filename)
            results.append(rename_pdf(pdf_path))
            progress.update(task, advance=1)

    table = Table(title="PDF Renaming Results")
    table.add_column("Original Filename", style="cyan")
    table.add_column("Author", style="magenta")
    table.add_column("Title", style="green")
    table.add_column("New Filename", style="yellow")
    for result in results:
        table.add_row(*result)
    console.print(Panel(table, expand=False))
if __name__ == "__main__":
    # Script entry point: exactly one argument, the folder to process, is
    # expected after the script name.
    if len(sys.argv) == 2:
        pdf_folder = sys.argv[1]
        main(pdf_folder)
    else:
        console.print("[bold red]Usage: python rename_pdfs.py <pdf_folder>[/bold red]")
        sys.exit(1)