From 436407288f01b5a2c31111062b0c2ac959dad443 Mon Sep 17 00:00:00 2001 From: afourney Date: Fri, 3 Jan 2025 16:03:11 -0800 Subject: [PATCH] If puremagic has no guesses, try again after ltrim. (#260) --- src/markitdown/_markitdown.py | 19 +++++++++++++++++++ tests/test_markitdown.py | 2 +- 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py index 50c83b4..aceaa86 100644 --- a/src/markitdown/_markitdown.py +++ b/src/markitdown/_markitdown.py @@ -1594,6 +1594,25 @@ def _guess_ext_magic(self, path): # Use puremagic to guess try: guesses = puremagic.magic_file(path) + + # Fix for: https://github.com/microsoft/markitdown/issues/222 + # If there are no guesses, then try again after trimming leading ASCII whitespaces. + # ASCII whitespace characters are those byte values in the sequence b' \t\n\r\x0b\f' + # (space, tab, newline, carriage return, vertical tab, form feed). + if len(guesses) == 0: + with open(path, "rb") as file: + while True: + char = file.read(1) + if not char: # End of file + break + if not char.isspace(): + file.seek(file.tell() - 1) + break + try: + guesses = puremagic.magic_stream(file) + except puremagic.main.PureError: + pass + extensions = list() for g in guesses: ext = g.extension.strip() diff --git a/tests/test_markitdown.py b/tests/test_markitdown.py index 9dc7374..e2d2e75 100644 --- a/tests/test_markitdown.py +++ b/tests/test_markitdown.py @@ -259,7 +259,7 @@ def test_markitdown_local() -> None: # Test input with leading blank characters input_data = b" \n\n\n

Test

" - result = markitdown.convert_stream(io.BytesIO(input_data), file_extension=".html") + result = markitdown.convert_stream(io.BytesIO(input_data)) assert "# Test" in result.text_content