# Similarity.py
# Forked from ChSatyaSavith/Legal_Document_Similarity
#Importing Libraries
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import sent_tokenize
from reportlab.lib.pagesizes import A4
from reportlab.pdfgen import canvas
import pandas as pd
import PyPDF2
import docx
import os
#Importing Libraries
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import sent_tokenize
from reportlab.lib.pagesizes import A4
from reportlab.pdfgen import canvas
import pandas as pd
import PyPDF2
import docx
import os
#Class For Similarity Checker
class LegalDocumentSimilarity:
    """Match the sentences of a PDF against a TF-IDF index of law-book lines.

    Workflow: build or extend the law-book corpus with ``update_embeddings``,
    load a document with ``read_docx``, score it with ``checkSimilarity`` and
    export the result table with ``save_Acts``.
    """

    # Column names of the result table; ``create_pdf`` reads the same keys.
    _DOC_COL = "Generated Document"
    _ACT_COL = "Similar_Line in Acts"
    _SCORE_COL = "Similarity_Score"

    def __init__(self, law_book_sentences='law_book.txt'):
        """Create the vectorizer and index the law-book text file if present.

        Args:
            law_book_sentences: Path of the text file holding one law-book
                line per row. When the file does not exist the corpus stays
                empty until ``update_embeddings`` is called.
        """
        self.vectorizer = TfidfVectorizer()
        # Always define the corpus attributes so later methods never hit a
        # missing attribute when no law-book file exists yet.
        self.book_lines = []
        self.law_corpus_tfidf = None
        if os.path.exists(law_book_sentences):
            self.book_lines = self._load_law_book(law_book_sentences)
            self.law_corpus_tfidf = self.vectorizer.fit_transform(self.book_lines)

    @staticmethod
    def _extract_pdf_text(pdf_path):
        """Return the text of every page of the PDF at *pdf_path*, concatenated."""
        with open(pdf_path, "rb") as pdf_file:
            pdf_reader = PyPDF2.PdfReader(pdf_file)
            # Defensive: treat a page that yields no text as an empty string.
            return "".join(page.extract_text() or "" for page in pdf_reader.pages)

    @staticmethod
    def _load_law_book(path):
        """Return the lines of the law-book text file without their newlines."""
        # errors="replace" keeps files that were written with a different
        # platform default encoding readable instead of raising.
        with open(path, encoding="utf-8", errors="replace") as file:
            return [line.strip('\n') for line in file]

    def update_embeddings(self, pdf_path, law_book_sentences='law_book.txt', book_path=True, folder_path=False):
        """Append the lines of one PDF, or of every PDF in a folder, to the corpus.

        The extracted lines are appended to *law_book_sentences* (created if
        missing); the whole file is then re-read and the TF-IDF index refitted.
        Lines are appended unconditionally, so passing the same PDF twice
        stores its lines twice.

        Args:
            pdf_path: Path of a PDF file, or of a directory containing PDFs.
            law_book_sentences: Text file that accumulates the corpus lines.
            book_path: Treat *pdf_path* as a law book to read. A directory is
                detected automatically and processed as a folder.
            folder_path: Treat *pdf_path* as a folder of PDFs (used when
                *book_path* is false).
        """
        # With the default book_path=True the original opened a directory as
        # if it were a PDF; detect directories so folder input works either way.
        if book_path:
            use_folder = os.path.isdir(pdf_path)
        else:
            use_folder = bool(folder_path)

        new_lines = []
        if use_folder:
            # Sorted so the order of appended lines does not depend on the
            # arbitrary order returned by os.listdir.
            for filename in sorted(os.listdir(pdf_path)):
                if filename.lower().endswith(".pdf"):
                    text = self._extract_pdf_text(os.path.join(pdf_path, filename))
                    new_lines.extend(text.split('\n'))
        elif book_path:
            new_lines = self._extract_pdf_text(pdf_path).split('\n')

        # Append mode creates the file when it does not exist yet.
        with open(law_book_sentences, 'a', encoding="utf-8") as file:
            file.writelines(line + '\n' for line in new_lines)

        # Refit on the complete file, not only on the newly added lines.
        self.book_lines = self._load_law_book(law_book_sentences)
        self.law_corpus_tfidf = self.vectorizer.fit_transform(self.book_lines)

    def read_docx(self, pdf_path):
        """Load one PDF, split it into sentences and vectorize them.

        Despite its name this method reads a PDF file. The sentences are
        stored in ``self.doc_sentences`` and their TF-IDF rows in
        ``self.document_tfidf``; the vectorizer must already be fitted on a
        law-book corpus.

        Args:
            pdf_path: Path of the PDF document to analyse.
        """
        text = self._extract_pdf_text(pdf_path)
        self.doc_sentences = list(sent_tokenize(text))
        # Nothing to vectorize when the PDF produced no sentences.
        if self.doc_sentences:
            self.document_tfidf = self.vectorizer.transform(self.doc_sentences)
        else:
            self.document_tfidf = None

    def checkSimilarity(self):
        """Find the closest law-book line for every loaded document sentence.

        Stores the result table in ``self.dataframe`` (document sentence,
        best-matching corpus line, cosine similarity) after dropping rows
        where either text consists only of digits/punctuation, and stores in
        ``self.percentage_match`` the percentage of document sentences that
        survive that filter.

        Raises:
            RuntimeError: If sentences are loaded but no corpus is indexed.
        """
        doc_col, act_col, score_col = self._DOC_COL, self._ACT_COL, self._SCORE_COL

        # An empty document would otherwise divide by zero below.
        if not self.doc_sentences:
            self.dataframe = pd.DataFrame(columns=[doc_col, act_col, score_col])
            self.percentage_match = 0.0
            return

        if self.law_corpus_tfidf is None:
            raise RuntimeError(
                "No law-book corpus is loaded; call update_embeddings() first."
            )

        similar_lines = []
        scores = []
        # One row at a time keeps the dense similarity result at 1 x corpus size.
        for row_idx in range(len(self.doc_sentences)):
            row_scores = cosine_similarity(self.document_tfidf[row_idx], self.law_corpus_tfidf)[0]
            best = row_scores.argmax()
            similar_lines.append(self.book_lines[best])
            scores.append(row_scores[best])

        frame = pd.DataFrame({
            doc_col: self.doc_sentences,
            act_col: similar_lines,
            score_col: scores,
        })

        # Rows made up solely of digits, punctuation or underscores are noise.
        pattern = r'^[0-9\W_]+$'
        noise = frame[doc_col].str.match(pattern) | frame[act_col].str.match(pattern)
        # Copy so the column assignments below do not write into a slice.
        frame = frame[~noise].copy()

        # regex=True is required: pandas >= 2.0 treats the pattern as a literal
        # string by default, which silently disables both replacements.
        for col in (doc_col, act_col):
            frame[col] = (
                frame[col]
                .str.replace(r'\n', ' ', regex=True)
                .str.replace(r'[^a-zA-Z0-9\s]', '', regex=True)
            )

        self.dataframe = frame
        self.percentage_match = (len(frame) / len(self.doc_sentences)) * 100

    def create_pdf(self, df, filename):
        """Write the rows of *df* to an A4 PDF as numbered question/answer entries.

        Args:
            df: Table with the columns 'Generated Document',
                'Similar_Line in Acts' and 'Similarity_Score'.
            filename: Path of the PDF file to create.
        """
        margin = 50
        line_step = 20
        chunk_size = 130  # characters drawn per line
        top_y = A4[1] - margin

        def split_text(value):
            # Fixed-width chunks so long text continues on further lines
            # instead of running past the page edge.
            text = str(value)
            return [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)] or [""]

        c = canvas.Canvas(filename, pagesize=A4)
        c.setFont("Helvetica", 8)
        y = top_y  # Start from the top of the page

        for num, (_, row) in enumerate(df.iterrows(), start=1):
            question_parts = split_text(row[self._DOC_COL])
            answer_parts = split_text(row[self._ACT_COL])
            number = row[self._SCORE_COL]

            entry_lines = [f"{num}) Question: {question_parts[0]}"]
            entry_lines.extend(question_parts[1:])
            entry_lines.append(f"Answer: {answer_parts[0]}")
            entry_lines.extend(answer_parts[1:])
            entry_lines.append(f"Similarity: {number}")

            for text in entry_lines:
                # Check before every line so nothing is drawn below the margin.
                if y < margin:
                    c.showPage()
                    # showPage() discards the canvas state, including the font.
                    c.setFont("Helvetica", 8)
                    y = top_y
                c.drawString(margin, y, text)
                y -= line_step

            y -= line_step  # Extra gap before the next entry

        # Save the PDF file
        c.save()

    def save_Acts(self, path):
        """Export the table produced by ``checkSimilarity`` to a PDF at *path*."""
        self.create_pdf(self.dataframe, path)