From 997c7af53c6cf8b1b4a58344c95c3cb181ba6f1c Mon Sep 17 00:00:00 2001 From: Adam Fourney Date: Thu, 14 Nov 2024 07:50:21 -0800 Subject: [PATCH 1/2] Added a simple CLI. --- pyproject.toml | 3 +++ src/markitdown/__main__.py | 42 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+) create mode 100644 src/markitdown/__main__.py diff --git a/pyproject.toml b/pyproject.toml index f5ffdb3..d1dd737 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -49,6 +49,9 @@ Source = "https://github.com/microsoft/markitdown" [tool.hatch.version] path = "src/markitdown/__about__.py" +[project.scripts] +markitdown = "markitdown.__main__:main" + [tool.hatch.envs.types] extra-dependencies = [ "mypy>=1.0.0", diff --git a/src/markitdown/__main__.py b/src/markitdown/__main__.py new file mode 100644 index 0000000..6c8a672 --- /dev/null +++ b/src/markitdown/__main__.py @@ -0,0 +1,42 @@ +# SPDX-FileCopyrightText: 2024-present Adam Fourney +# +# SPDX-License-Identifier: MIT +import sys +from ._markitdown import MarkItDown + + +def main(): + if len(sys.argv) == 1: + markitdown = MarkItDown() + result = markitdown.convert_stream(sys.stdin.buffer) + print(result.text_content) + elif len(sys.argv) == 2: + markitdown = MarkItDown() + result = markitdown.convert(sys.argv[1]) + print(result.text_content) + else: + sys.stderr.write( + """ +SYNTAX: + + markitdown <OPTIONAL: FILENAME> + If FILENAME is empty, markitdown reads from stdin. + +EXAMPLE: + + markitdown example.pdf + + OR + + cat example.pdf | markitdown + + OR + + markitdown < example.pdf +""".strip() + + "\n" + ) + + +if __name__ == "__main__": + main() From 2eab564c4cfdf0dfef858a3e02fb1c5d0fd055d2 Mon Sep 17 00:00:00 2001 From: Adam Fourney Date: Thu, 14 Nov 2024 10:23:40 -0800 Subject: [PATCH 2/2] Fix continue trying on errors.
--- src/markitdown/_markitdown.py | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py index 255337f..a0c479e 100644 --- a/src/markitdown/_markitdown.py +++ b/src/markitdown/_markitdown.py @@ -11,6 +11,7 @@ import subprocess import sys import tempfile +import traceback from typing import Any, Dict, List, Optional, Union from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse @@ -913,7 +914,9 @@ def convert_local( # Get extension alternatives from the path and puremagic base, ext = os.path.splitext(path) self._append_ext(extensions, ext) - self._append_ext(extensions, self._guess_ext_magic(path)) + + for g in self._guess_ext_magic(path): + self._append_ext(extensions, g) # Convert return self._convert(path, extensions, **kwargs) @@ -940,7 +943,8 @@ def convert_stream( fh.close() # Use puremagic to check for more extension options - self._append_ext(extensions, self._guess_ext_magic(temp_path)) + for g in self._guess_ext_magic(temp_path): + self._append_ext(extensions, g) # Convert result = self._convert(temp_path, extensions, **kwargs) @@ -1032,10 +1036,10 @@ def _convert( _kwargs["mlm_model"] = self._mlm_model # If we hit an error log it and keep trying - # try: - res = converter.convert(local_path, **_kwargs) - # except Exception: - # error_trace = ("\n\n" + traceback.format_exc()).strip() + try: + res = converter.convert(local_path, **_kwargs) + except Exception: + error_trace = ("\n\n" + traceback.format_exc()).strip() if res is not None: # Normalize the content @@ -1074,10 +1078,15 @@ def _guess_ext_magic(self, path): # Use puremagic to guess try: guesses = puremagic.magic_file(path) - if len(guesses) > 0: - ext = guesses[0].extension.strip() + extensions = list() + for g in guesses: + ext = g.extension.strip() if len(ext) > 0: - return ext + if not ext.startswith("."): + ext = "." 
+ ext + if ext not in extensions: + extensions.append(ext) + return extensions except FileNotFoundError: pass except IsADirectoryError: