-
Notifications
You must be signed in to change notification settings - Fork 0
/
pdfdiffer.py
144 lines (111 loc) · 4.73 KB
/
pdfdiffer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
import os
import datetime
import argparse
from pdf2image import convert_from_path
import cv2
import numpy as np
from PIL import Image
from bs4 import BeautifulSoup
class UnionFind:
    """Disjoint-set (union-find) structure over arbitrary hashable items.

    Items are registered lazily: the first ``find`` on an unseen item makes
    it the root of its own singleton set.
    """

    def __init__(self):
        # Maps every known item to its parent; a root maps to itself.
        self.parent = {}

    def find(self, x):
        """Return the representative (root) of the set containing *x*.

        An unseen *x* is added as its own root.  The lookup is iterative so
        that a long parent chain (for example after ``union(0, 1)``,
        ``union(1, 2)``, ...) cannot exceed the interpreter's recursion
        limit.  Every item visited on the way is re-pointed directly at the
        root (path compression).
        """
        parent = self.parent
        if x not in parent:
            parent[x] = x
            return x

        # First pass: walk up to the root.
        root = x
        while parent[root] != root:
            root = parent[root]

        # Second pass: compress the path just walked.
        while x != root:
            next_item = parent[x]
            parent[x] = root
            x = next_item
        return root

    def union(self, x, y):
        """Merge the set containing *x* into the set containing *y*.

        The root of *y*'s set becomes the root of the merged set.
        """
        self.parent[self.find(x)] = self.find(y)
def overlapping_boxes(boxes):
    """Partition *boxes* into groups of transitively overlapping rectangles.

    Each box is indexed as ``(x, y, width, height)``.  Every pair of boxes
    is tested once; overlapping pairs are linked through a ``UnionFind`` so
    that A-B and B-C overlaps place A, B and C in the same group.  Boxes
    that only touch along an edge are not linked, because all comparisons
    are strict.

    Returns a ``dict`` values view holding one list of boxes per group.
    """
    def intersects(a, b):
        # Strict inequalities on both axes: shared edges do not count.
        return (
            a[0] < b[0] + b[2]
            and a[0] + a[2] > b[0]
            and a[1] < b[1] + b[3]
            and a[1] + a[3] > b[1]
        )

    total = len(boxes)
    links = UnionFind()
    for first in range(total):
        for second in range(first + 1, total):
            if intersects(boxes[first], boxes[second]):
                links.union(first, second)

    # Bucket every box under the root index of its set.
    clusters = {}
    for index in range(total):
        clusters.setdefault(links.find(index), []).append(boxes[index])
    return clusters.values()
def compare_images(img1_path, img2_path, output_path, padding=10):
    """Write a copy of the second image with its differences boxed in red.

    Both images are loaded, converted to grayscale and subtracted.  The
    difference is binarised with Otsu's threshold, the external contours of
    the result are boxed, overlapping boxes are merged, and each merged box
    is drawn on the second image, which is then saved to *output_path*.

    Args:
        img1_path: Path of the reference image.
        img2_path: Path of the image to annotate.
        output_path: Path the annotated image is written to.
        padding: Pixels added on every side of each contour's bounding box
            before overlapping boxes are merged.

    Raises:
        OSError: If either input image cannot be read or the output image
            cannot be written.
    """
    img1 = cv2.imread(img1_path)
    img2 = cv2.imread(img2_path)
    # cv2.imread reports failure by returning None rather than raising.
    for path, image in ((img1_path, img1), (img2_path, img2)):
        if image is None:
            raise OSError(f'Could not read image: {path}')

    # Convert the images to grayscale
    img1_gray = cv2.cvtColor(img1, cv2.COLOR_BGR2GRAY)
    img2_gray = cv2.cvtColor(img2, cv2.COLOR_BGR2GRAY)

    # cv2.absdiff requires equally sized inputs.  Scale the reference to the
    # annotated image so that box coordinates stay valid for img2.
    if img1_gray.shape != img2_gray.shape:
        height, width = img2_gray.shape
        img1_gray = cv2.resize(img1_gray, (width, height))

    # Compute the absolute difference between the two images
    diff = cv2.absdiff(img1_gray, img2_gray)

    # Threshold the difference image (this will reveal regions of non-zero pixels)
    _, thresh = cv2.threshold(diff, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)

    # Find contours in the threshold image
    contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    # Collect the bounding rectangles around each contour, with padding
    boxes = []
    for contour in contours:
        x, y, w, h = cv2.boundingRect(contour)
        boxes.append((x - padding, y - padding, w + 2 * padding, h + 2 * padding))

    # Draw one rectangle around each group of overlapping boxes
    for group in overlapping_boxes(boxes):
        min_x = min(box[0] for box in group)
        min_y = min(box[1] for box in group)
        max_x = max(box[0] + box[2] for box in group)
        max_y = max(box[1] + box[3] for box in group)
        cv2.rectangle(img2, (min_x, min_y), (max_x, max_y), (0, 0, 255), 3)

    # Save the image with the differences highlighted; cv2.imwrite returns
    # False on failure, which would otherwise go unnoticed.
    if not cv2.imwrite(output_path, img2):
        raise OSError(f'Could not write image: {output_path}')
def main():
    """Compare two PDFs page by page and write an HTML side-by-side report.

    Reads the two PDF paths from the command line, renders every page to an
    image, and pads the shorter document with blank white pages so both
    have the same page count.  A timestamp-named output directory is
    created; each page pair is saved there as PNG and passed to
    ``compare_images``.  The original page and the annotated page are
    appended to the first and second ``column`` elements of
    ``template.html``, and the result is saved as ``comparison.html`` in
    the output directory.

    Raises:
        ValueError: If the template has fewer than two elements with class
            ``column``.
    """
    parser = argparse.ArgumentParser(description='Compare two PDFs and generate an HTML page showing the original PDF next to the one with changes highlighted.')
    parser.add_argument('pdf1', help='The path to the first PDF file')
    parser.add_argument('pdf2', help='The path to the second PDF file')
    args = parser.parse_args()

    # Convert the PDFs to images
    pages1 = convert_from_path(args.pdf1)
    pages2 = convert_from_path(args.pdf2)

    # Pad the shorter document with blank pages.  Each blank page takes the
    # size of the page it will be compared with, so the pair always matches
    # and a document with zero pages does not need a first page to copy.
    max_len = max(len(pages1), len(pages2))
    for shorter, longer in ((pages1, pages2), (pages2, pages1)):
        for index in range(len(shorter), len(longer)):
            blank = Image.new('RGB', longer[index].size, (255, 255, 255))
            shorter.append(blank)

    # Create the output directories
    output_dir = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
    images_dir = os.path.join(output_dir, 'images')
    os.makedirs(images_dir, exist_ok=True)

    # Load the HTML template
    with open('template.html', encoding='utf-8') as f:
        soup = BeautifulSoup(f, 'html.parser')

    # Find the columns in the HTML
    columns = soup.find_all(class_='column')
    if len(columns) < 2:
        raise ValueError(
            "template.html must contain at least two elements with class 'column'"
        )
    original_column = columns[0]
    changes_column = columns[1]

    for i in range(max_len):
        page_number = i + 1

        # Save the images from the PDFs
        img1_path = os.path.join(images_dir, f'pdf1_page{page_number}.png')
        img2_path = os.path.join(images_dir, f'pdf2_page{page_number}.png')
        pages1[i].save(img1_path, 'PNG')
        pages2[i].save(img2_path, 'PNG')

        # Compare the images and save the result
        output_path = os.path.join(images_dir, f'output_page{page_number}.png')
        compare_images(img1_path, img2_path, output_path)

        # Add the images to the columns; sources are relative to the HTML file
        img1 = soup.new_tag('img', src=f'images/pdf1_page{page_number}.png')
        img2 = soup.new_tag('img', src=f'images/output_page{page_number}.png')
        original_column.append(img1)
        changes_column.append(img2)

    # Save the modified HTML
    with open(os.path.join(output_dir, 'comparison.html'), 'w', encoding='utf-8') as f:
        f.write(str(soup))
# Run the comparison only when this file is executed as a script.
if __name__ == '__main__':
    main()