Skip to content

Commit

Permalink
If puremagic has no guesses, try again after ltrim. (#260)
Browse files Browse the repository at this point in the history
  • Loading branch information
afourney authored Jan 4, 2025
1 parent 731b39e commit 4364072
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 1 deletion.
19 changes: 19 additions & 0 deletions src/markitdown/_markitdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -1594,6 +1594,25 @@ def _guess_ext_magic(self, path):
# Use puremagic to guess
try:
guesses = puremagic.magic_file(path)

# Fix for: https://github.com/microsoft/markitdown/issues/222
# If there are no guesses, then try again after trimming leading ASCII whitespace.
# ASCII whitespace characters are those byte values in the sequence b' \t\n\r\x0b\f'
# (space, tab, newline, carriage return, vertical tab, form feed).
if len(guesses) == 0:
with open(path, "rb") as file:
while True:
char = file.read(1)
if not char: # End of file
break
if not char.isspace():
file.seek(file.tell() - 1)
break
try:
guesses = puremagic.magic_stream(file)
except puremagic.main.PureError:
pass

extensions = list()
for g in guesses:
ext = g.extension.strip()
Expand Down
2 changes: 1 addition & 1 deletion tests/test_markitdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -259,7 +259,7 @@ def test_markitdown_local() -> None:

# Test input with leading blank characters
input_data = b" \n\n\n<html><body><h1>Test</h1></body></html>"
result = markitdown.convert_stream(io.BytesIO(input_data), file_extension=".html")
result = markitdown.convert_stream(io.BytesIO(input_data))
assert "# Test" in result.text_content


Expand Down

0 comments on commit 4364072

Please sign in to comment.