diff --git a/pyproject.toml b/pyproject.toml
index 74df032..756380a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -38,6 +38,7 @@ dependencies = [
     "youtube-transcript-api",
     "SpeechRecognition",
     "pathvalidate",
+    "charset-normalizer",
 ]
 
 [project.urls]
diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py
index 96997cf..25786f6 100644
--- a/src/markitdown/_markitdown.py
+++ b/src/markitdown/_markitdown.py
@@ -26,6 +26,7 @@ import puremagic
 import requests
 from bs4 import BeautifulSoup
+from charset_normalizer import from_path
 
 # Optional Transcription support
 try:
@@ -161,9 +162,15 @@ def convert(
         elif "text/" not in content_type.lower():
             return None
 
-        text_content = ""
-        with open(local_path, "rt", encoding="utf-8") as fh:
-            text_content = fh.read()
+        # CharsetMatches.best() returns None when no plausible encoding is
+        # found; str(None) would otherwise emit the literal text "None".
+        best_match = from_path(local_path).best()
+        if best_match is None:
+            # Detection failed: keep the previous strict UTF-8 behaviour.
+            with open(local_path, "rt", encoding="utf-8") as fh:
+                text_content = fh.read()
+        else:
+            text_content = str(best_match)
         return DocumentConverterResult(
             title=None,
             text_content=text_content,