Skip to content

Commit

Permalink
Merge pull request #149 from VikParuchuri/dev
Browse files: browse the repository at this point in the history
Enable setting start page
  • Loading branch information
VikParuchuri authored May 29, 2024
2 parents a3334ce + 1e5fa80 commit aa8e7f0
Show file tree
Hide file tree
Showing 6 changed files with 29 additions and 9 deletions.
4 changes: 4 additions & 0 deletions convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,11 @@ def main():

mp.set_start_method('spawn') # Required for CUDA, forkserver doesn't work
model_lst = load_all_models()

for model in model_lst:
if model.device.type == "mps":
raise ValueError("Cannot use MPS with torch multiprocessing share_memory. You have to use CUDA or CPU. Set the TORCH_DEVICE environment variable to change the device.")

if model:
model.share_memory()

Expand Down
3 changes: 2 additions & 1 deletion convert_single.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ def main():
parser.add_argument("filename", help="PDF file to parse")
parser.add_argument("output", help="Output base folder path")
parser.add_argument("--max_pages", type=int, default=None, help="Maximum number of pages to parse")
parser.add_argument("--start_page", type=int, default=None, help="Page to start processing at")
parser.add_argument("--langs", type=str, help="Languages to use for OCR, comma separated", default=None)
parser.add_argument("--batch_multiplier", type=int, default=2, help="How much to increase batch sizes")
args = parser.parse_args()
Expand All @@ -24,7 +25,7 @@ def main():

fname = args.filename
model_lst = load_all_models()
full_text, images, out_meta = convert_single_pdf(fname, model_lst, max_pages=args.max_pages, langs=langs, batch_multiplier=args.batch_multiplier)
full_text, images, out_meta = convert_single_pdf(fname, model_lst, max_pages=args.max_pages, langs=langs, batch_multiplier=args.batch_multiplier, start_page=args.start_page)

fname = os.path.basename(fname)
subfolder_path = save_markdown(args.output, fname, full_text, images, out_meta)
Expand Down
9 changes: 8 additions & 1 deletion marker/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ def convert_single_pdf(
fname: str,
model_lst: List,
max_pages: int = None,
start_page: int = None,
metadata: Optional[Dict] = None,
langs: Optional[List[str]] = None,
batch_multiplier: int = 1
Expand Down Expand Up @@ -66,12 +67,18 @@ def convert_single_pdf(
doc,
fname,
max_pages=max_pages,
start_page=start_page
)
out_meta.update({
"toc": toc,
"pages": len(pages),
})

# Trim pages from doc to align with start page
if start_page:
for page_idx in range(start_page):
doc.del_page(0)

# Unpack models from list
texify_model, layout_model, order_model, edit_model, detection_model, ocr_model = model_lst

Expand Down Expand Up @@ -99,7 +106,7 @@ def convert_single_pdf(
annotate_block_types(pages)

# Dump debug data if flags are set
dump_bbox_debug_data(doc, pages)
dump_bbox_debug_data(doc, fname, pages)

# Find reading order for blocks
# Sort blocks by reading order
Expand Down
4 changes: 2 additions & 2 deletions marker/debug/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,12 +42,12 @@ def dump_equation_debug_data(doc, images, converted_spans):
json.dump(data_lines, f)


def dump_bbox_debug_data(doc, blocks: List[Page]):
def dump_bbox_debug_data(doc, fname, blocks: List[Page]):
if not settings.DEBUG_DATA_FOLDER or settings.DEBUG_LEVEL < 2:
return

# Remove extension from doc name
doc_base = os.path.basename(doc.name).rsplit(".", 1)[0]
doc_base = fname.rsplit(".", 1)[0]

debug_file = os.path.join(settings.DEBUG_DATA_FOLDER, f"{doc_base}_bbox.json")
debug_data = []
Expand Down
16 changes: 12 additions & 4 deletions marker/pdf/extract_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,13 +74,21 @@ def pdftext_format_to_blocks(page, pnum: int) -> Page:
return out_page


def get_text_blocks(doc, fname, max_pages: Optional[int] = None) -> (List[Page], Dict):
def get_text_blocks(doc, fname, max_pages: Optional[int] = None, start_page: Optional[int] = None) -> (List[Page], Dict):
toc = get_toc(doc)

page_range = range(len(doc))
if start_page:
assert start_page < len(doc)
else:
start_page = 0

if max_pages:
range_end = min(max_pages, len(doc))
page_range = range(range_end)
if max_pages + start_page > len(doc):
max_pages = len(doc) - start_page
else:
max_pages = len(doc) - start_page

page_range = range(start_page, start_page + max_pages)

char_blocks = dictionary_output(fname, page_range=page_range, keep_chars=True, workers=settings.PDFTEXT_CPU_WORKERS)
marker_blocks = [pdftext_format_to_blocks(page, pnum) for pnum, page in enumerate(char_blocks)]
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "marker-pdf"
version = "0.2.11"
version = "0.2.12"
description = "Convert PDF to markdown with high speed and accuracy."
authors = ["Vik Paruchuri <[email protected]>"]
readme = "README.md"
Expand Down

0 comments on commit aa8e7f0

Please sign in to comment.