-
Notifications
You must be signed in to change notification settings - Fork 0
/
file_text_extractor.py
78 lines (61 loc) · 3.22 KB
/
file_text_extractor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
import os
import pandas as pd
from utils.utils import pdf_loader, write_text_file
def pdf_text_extractor(source_filepath: str, filename: str, destination_filepath: str):
"""
Extracts the text from a PDF file and save it in a text file
:param str source_filepath: The file path of the source directory
:param str filename: PDF file name
:param str destination_filepath: The file path of the destination directory
"""
print(f'\nExtracting text from the PDF: {filename}')
#Extracting the text from the PDF file
complete_file_path = f'{source_filepath}/{filename}'
pdf_text = pdf_loader(complete_file_path)
#Saving the text in a text file
filename = ''.join(filename.split('.')[:-1])
complete_dest_file_path = f'{destination_filepath}/{filename}.txt'
filesaved = write_text_file(complete_dest_file_path, pdf_text)
print(f'Text file saved successfully: {filesaved}')
def all_pdf_text_extraction(source_filepath: str, destination_filepath: str):
"""
Extracts text from all the PDF's in the source directory
:param str source_filepath: The file path of the source directory
:param str destination_filepath: The file path of the destination directory
"""
print(f'\nExtracting text from all PDF\'s from the directory: {source_filepath}')
for filename in os.listdir(source_filepath):
#Checking for the PDF file
if not filename.endswith('.pdf'): continue
#Extracting the text
pdf_text_extractor(source_filepath, filename, destination_filepath)
def read_csv_file(source_filepath: str, filename: str, columns_to_return: list, return_rows: int = None):
"""
Reads a CSV file and return the content as a DataFrame Object
:param str source_filepath: The file path of the source directory
:param str filename: CSV file name
:param list columns_to_return: The list of column names to return in the final DataFrame
:param int return_rows: If not None, the total rows to be returned from the DataFrame
:return DataFrame: The DataFrame
"""
#Reading the CSV file with the appropiate encoding
data = pd.read_csv(f'{source_filepath}/{filename}', encoding='ISO-8859-1')
#Extracting rows where the value of the specified columns are not empty
mask = data[columns_to_return].apply(lambda col: col != '-', axis = 0).all(axis = 1)
data = data[mask]
data = data[columns_to_return]
#Checking for returning rows
if return_rows:
data = data.sample(return_rows)
#Returning the final dataframe
return data.reset_index(drop = True)
if __name__ == "__main__":
print('\nLLMs4SchemaDiscovery Framework -- A Human-in-the-Loop Workflow for Scientific Schema Mining with Large Language Models ')
print('Formatting Knowledge Base - Converting PDF Documents to Text Files')
print('\nPlease input the directory location containing the PDF documents')
source_dir_path = input('Directory Path> ')
print('\nPlease input the directory Location to store the converted text files')
destination_dir_path = input('Directory Path> ')
#Executing PDF to text conversion
all_pdf_text_extraction(source_dir_path, destination_dir_path)
print('\nPDF documents successfully convert to text format!')