convert.py (forked from VikParuchuri/marker)
import os

os.environ["IN_STREAMLIT"] = "true"  # Avoid multiprocessing inside surya
os.environ["PDFTEXT_CPU_WORKERS"] = "1"  # Avoid multiprocessing inside pdftext

import pypdfium2  # Needs to be at the top to avoid warnings
import argparse
import torch.multiprocessing as mp
from tqdm import tqdm
import math

from marker.convert import convert_single_pdf
from marker.output import markdown_exists, save_markdown
from marker.pdf.utils import find_filetype
from marker.pdf.extract_text import get_length_of_text
from marker.models import load_all_models
from marker.settings import settings
from marker.logger import configure_logging
import traceback
import json

configure_logging()
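
# Example invocation (paths are placeholders; the flags are defined in main() below):
#   python convert.py /path/to/pdfs /path/to/output --workers 5 --min_length 1000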


def worker_init(shared_model):
    global model_refs
    model_refs = shared_model


def worker_exit():
    global model_refs
    del model_refs
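
# Note: worker_init/worker_exit implement a load-once, share-everywhere pattern.
# The parent process loads the models, moves them to shared memory, and every
# pool worker picks up the same references through the initializer, so no
# worker ever loads its own copy.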


def process_single_pdf(args):
    filepath, out_folder, metadata, min_length = args

    fname = os.path.basename(filepath)
    if markdown_exists(out_folder, fname):
        return

    try:
        # Skip trying to convert files that don't have a lot of embedded text
        # This can indicate that they were scanned, and not OCRed properly
        # Usually these files are not recent/high-quality
        if min_length:
            filetype = find_filetype(filepath)
            if filetype == "other":
                return 0

            length = get_length_of_text(filepath)
            if length < min_length:
                return

        full_text, images, out_metadata = convert_single_pdf(filepath, model_refs, metadata=metadata)
        if len(full_text.strip()) > 0:
            save_markdown(out_folder, fname, full_text, images, out_metadata)
        else:
            print(f"Empty file: {filepath}. Could not convert.")
    except Exception as e:
        print(f"Error converting {filepath}: {e}")
        print(traceback.format_exc())


def main():
    parser = argparse.ArgumentParser(description="Convert multiple pdfs to markdown.")
    parser.add_argument("in_folder", help="Input folder with pdfs.")
    parser.add_argument("out_folder", help="Output folder")
    parser.add_argument("--chunk_idx", type=int, default=0, help="Chunk index to convert")
    parser.add_argument("--num_chunks", type=int, default=1, help="Number of chunks being processed in parallel")
    parser.add_argument("--max", type=int, default=None, help="Maximum number of pdfs to convert")
    parser.add_argument("--workers", type=int, default=5, help="Number of worker processes to use")
    parser.add_argument("--metadata_file", type=str, default=None, help="Metadata json file to use for filtering")
    parser.add_argument("--min_length", type=int, default=None, help="Minimum length of pdf to convert")
    args = parser.parse_args()

    in_folder = os.path.abspath(args.in_folder)
    out_folder = os.path.abspath(args.out_folder)
    files = [os.path.join(in_folder, f) for f in os.listdir(in_folder)]
    files = [f for f in files if os.path.isfile(f)]
    os.makedirs(out_folder, exist_ok=True)

    # Handle chunks if we're processing in parallel
    # Ensure we get all files into a chunk
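    # Worked example (illustrative numbers): with 10 files and --num_chunks 3,
    # chunk_size = ceil(10 / 3) = 4, so chunk 0 takes files[0:4], chunk 1 takes
    # files[4:8], and chunk 2 takes files[8:12], which slicing clamps to files[8:10].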
    chunk_size = math.ceil(len(files) / args.num_chunks)
    start_idx = args.chunk_idx * chunk_size
    end_idx = start_idx + chunk_size
    files_to_convert = files[start_idx:end_idx]

    # Limit files converted if needed
    if args.max:
        files_to_convert = files_to_convert[:args.max]

    metadata = {}
    if args.metadata_file:
        metadata_file = os.path.abspath(args.metadata_file)
        with open(metadata_file, "r") as f:
            metadata = json.load(f)
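
    # The metadata file is expected to be a JSON object keyed by pdf filename,
    # since each task looks up its entry via metadata.get(os.path.basename(f)) below.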

    total_processes = min(len(files_to_convert), args.workers)

    # Dynamically set GPU allocation per task based on GPU ram
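    # Worked example (illustrative numbers): with INFERENCE_RAM = 16 (GB) and
    # VRAM_PER_TASK = 5 (GB), 16 // 5 = 3 tasks fit on the GPU at once, so
    # total_processes is capped at 3 even if more workers were requested.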
    if settings.CUDA:
        tasks_per_gpu = settings.INFERENCE_RAM // settings.VRAM_PER_TASK if settings.CUDA else 0
        total_processes = min(tasks_per_gpu, total_processes)

    mp.set_start_method('spawn')  # Required for CUDA, forkserver doesn't work
    model_lst = load_all_models()

    for model in model_lst:
        if model is None:
            continue

        if model.device.type == "mps":
            raise ValueError("Cannot use MPS with torch multiprocessing share_memory. You have to use CUDA or CPU. Set the TORCH_DEVICE environment variable to change the device.")

        model.share_memory()
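
    # share_memory() moves each model's tensors into shared memory so every
    # spawned worker reads the same weights instead of loading its own copy;
    # MPS tensors do not support this, hence the error above.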
print(f"Converting {len(files_to_convert)} pdfs in chunk {args.chunk_idx + 1}/{args.num_chunks} with {total_processes} processes, and storing in {out_folder}")
task_args = [(f, out_folder, metadata.get(os.path.basename(f)), args.min_length) for f in files_to_convert]
with mp.Pool(processes=total_processes, initializer=worker_init, initargs=(model_lst,)) as pool:
list(tqdm(pool.imap(process_single_pdf, task_args), total=len(task_args), desc="Processing PDFs", unit="pdf"))
pool._worker_handler.terminate = worker_exit
# Delete all CUDA tensors
del model_lst


if __name__ == "__main__":
    main()