class ZipConverter(DocumentConverter):
    """Converts ZIP files to markdown by extracting and converting all contained files.

    The archive is extracted into a folder named ``extracted_<zip name>`` (with
    ``.zip`` replaced by ``_zip``) created next to the ZIP file. Every extracted
    file is then offered, in order, to the converters passed in through the
    ``_parent_converters`` keyword argument; the first converter that returns a
    result wins. The per-file results are concatenated into a single markdown
    document. Unless ``cleanup_extracted=False`` is passed, the extraction
    folder is removed afterwards, including when conversion fails part-way.

    Example output format:
    ```markdown
    Content from the zip file `example.zip`:

    ## File: docs/readme.txt

    This is the content of readme.txt
    Multiple lines are preserved

    ## File: images/example.jpg

    ImageSize: 1920x1080
    DateTimeOriginal: 2024-02-15 14:30:00
    Description: A beautiful landscape photo

    ## File: data/report.xlsx

    ## Sheet1
    | Column1 | Column2 | Column3 |
    |---------|---------|---------|
    | data1   | data2   | data3   |
    | data4   | data5   | data6   |
    ```

    Key features:
    - Maintains original file structure in headings
    - Walks nested directories inside the archive (sorted, so output is stable)
    - Uses appropriate converters for each file type
    - Preserves formatting of converted content
    - Cleans up extracted files after processing

    Limitations:
    - ZIP files nested inside the archive are not expanded: ``ZipConverter``
      instances are excluded from the converters applied to extracted files.
    - Files that no converter accepts are silently omitted from the output.
    """

    def convert(
        self, local_path: str, **kwargs: Any
    ) -> Union[None, DocumentConverterResult]:
        """Convert the ZIP archive at ``local_path`` into one markdown document.

        Args:
            local_path: Path of the ZIP file on disk.
            **kwargs: Conversion options. The keys read here are
                ``file_extension`` (must be ``.zip``, case-insensitive),
                ``_parent_converters`` (converters to try on each extracted
                file) and ``cleanup_extracted`` (default ``True``; remove the
                extraction folder when done). All kwargs are forwarded to the
                per-file converters with ``file_extension`` overridden.

        Returns:
            ``None`` when the file extension is not ``.zip``; otherwise a
            ``DocumentConverterResult`` whose ``text_content`` is either the
            combined markdown or an ``[ERROR] ...`` message.
        """
        # Bail if not a ZIP. `or ""` also tolerates an explicit None.
        extension = kwargs.get("file_extension") or ""
        if extension.lower() != ".zip":
            return None

        # Get parent converters list if available
        parent_converters = kwargs.get("_parent_converters", [])
        if not parent_converters:
            return DocumentConverterResult(
                title=None,
                text_content=f"[ERROR] No converters available to process zip contents from: {local_path}",
            )

        # Skip zip converters to avoid infinite recursion. Computed once here
        # instead of being re-checked for every extracted file.
        member_converters = [
            converter
            for converter in parent_converters
            if not isinstance(converter, ZipConverter)
        ]

        zip_name = os.path.basename(local_path)
        parent_dir = os.path.dirname(local_path)
        extracted_zip_folder_name = f"extracted_{zip_name.replace('.zip', '_zip')}"
        new_folder = os.path.normpath(
            os.path.join(parent_dir, extracted_zip_folder_name)
        )

        # Safety check for path traversal. Both sides are made absolute (which
        # also normalises them) before comparing: a raw string-prefix test
        # wrongly rejects inputs such as "./archive.zip", where the dirname is
        # "." but the normalised folder no longer starts with ".".
        base_dir = os.path.abspath(parent_dir)
        if os.path.commonpath([base_dir, os.path.abspath(new_folder)]) != base_dir:
            return DocumentConverterResult(
                title=None, text_content=f"[ERROR] Invalid zip file path: {local_path}"
            )

        sections = [f"Content from the zip file `{zip_name}`:\n\n"]
        # Only clean up a folder we actually started writing into; if the
        # archive cannot even be opened, nothing of ours is on disk.
        extraction_started = False

        try:
            # Extract the zip file
            with zipfile.ZipFile(local_path, "r") as archive:
                extraction_started = True
                archive.extractall(path=new_folder)

            # Process each extracted file
            for root, dirs, files in os.walk(new_folder):
                # Sorting in place steers the top-down walk, so the output
                # order does not depend on the filesystem's listing order.
                dirs.sort()
                for name in sorted(files):
                    file_path = os.path.join(root, name)
                    relative_path = os.path.relpath(file_path, new_folder)

                    # Get file extension
                    _, file_extension = os.path.splitext(name)

                    # Update kwargs for the file (the copy already carries
                    # "_parent_converters" and every other caller option).
                    file_kwargs = kwargs.copy()
                    file_kwargs["file_extension"] = file_extension

                    # Try converting the file using available converters;
                    # the first one that returns a result is used.
                    for converter in member_converters:
                        result = converter.convert(file_path, **file_kwargs)
                        if result is not None:
                            sections.append(f"\n## File: {relative_path}\n\n")
                            sections.append(result.text_content + "\n\n")
                            break

            return DocumentConverterResult(
                title=None, text_content="".join(sections).strip()
            )

        except zipfile.BadZipFile:
            return DocumentConverterResult(
                title=None,
                text_content=f"[ERROR] Invalid or corrupted zip file: {local_path}",
            )
        except Exception as e:
            return DocumentConverterResult(
                title=None,
                text_content=f"[ERROR] Failed to process zip file {local_path}: {str(e)}",
            )
        finally:
            # Clean up extracted files if specified. Running this in `finally`
            # means a failed extraction or conversion no longer leaves the
            # folder behind. Cleanup is best-effort: a failure to delete must
            # not replace the conversion result with an exception.
            if extraction_started and kwargs.get("cleanup_extracted", True):
                shutil.rmtree(new_folder, ignore_errors=True)
and b/tests/test_files/test_files.zip differ diff --git a/tests/test_markitdown.py b/tests/test_markitdown.py index ac08820..9aaa37e 100644 --- a/tests/test_markitdown.py +++ b/tests/test_markitdown.py @@ -151,6 +151,12 @@ def test_markitdown_local() -> None: text_content = result.text_content.replace("\\", "") assert test_string in text_content + # Test ZIP file processing + result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_files.zip")) + for test_string in DOCX_TEST_STRINGS: + text_content = result.text_content.replace("\\", "") + assert test_string in text_content + # Test Wikipedia processing result = markitdown.convert( os.path.join(TEST_FILES_DIR, "test_wikipedia.html"), url=WIKIPEDIA_TEST_URL