From 039d36368b7ee9374a37267762ef83c83e652da4 Mon Sep 17 00:00:00 2001 From: Jahziah Wagner Date: Sat, 20 Jul 2024 18:19:20 +0200 Subject: [PATCH] MIME types too unreliable for use --- pyproject.toml | 3 +- requirements.txt | 5 +- src/claudesync/cli/api.py | 2 - src/claudesync/cli/sync.py | 10 +-- src/claudesync/config_manager.py | 4 +- src/claudesync/utils.py | 109 ++++++++++++++++++------------- 6 files changed, 74 insertions(+), 59 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index da3c0bf..a0c3fdc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "claudesync" -version = "0.3.1" +version = "0.3.2" authors = [ {name = "Jahziah Wagner", email = "jahziah.wagner+pypi@gmail.com"}, ] @@ -26,6 +26,7 @@ dependencies = [ "pytest", "pytest-cov", "click_completion", + "chardet", ] [project.urls] diff --git a/requirements.txt b/requirements.txt index 526b5ee..c967083 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,6 +3,5 @@ requests>=2.32.3 pathspec>=0.12.1 crontab>=1.0.1 setuptools>=65.5.1 -pytest>=8.2.2 -pytest-cov>=5.0.0 -click_completion>=0.5.2 \ No newline at end of file +claudesync>=0.3.2 +chardet>=5.2.0 \ No newline at end of file diff --git a/src/claudesync/cli/api.py b/src/claudesync/cli/api.py index ebe99fd..88f1255 100644 --- a/src/claudesync/cli/api.py +++ b/src/claudesync/cli/api.py @@ -64,5 +64,3 @@ def max_filesize(config, size): return config.set("max_file_size", size) click.echo(f"Maximum file size set to {size} bytes.") - - diff --git a/src/claudesync/cli/sync.py b/src/claudesync/cli/sync.py index 06a21ff..913c050 100644 --- a/src/claudesync/cli/sync.py +++ b/src/claudesync/cli/sync.py @@ -55,7 +55,7 @@ def sync(config): local_files = get_local_files(local_path) # Track remote files to delete - remote_files_to_delete = set(rf['file_name'] for rf in remote_files) + remote_files_to_delete = set(rf["file_name"] for rf in remote_files) for local_file, local_checksum in local_files.items(): remote_file = next( @@ -69,7 +69,7 @@ def sync(config): active_organization_id, active_project_id, remote_file["uuid"] ) with open( - os.path.join(local_path, local_file), "r", encoding="utf-8" + os.path.join(local_path, local_file), "r", encoding="utf-8" ) as file: content = file.read() provider.upload_file( @@ -80,7 +80,7 @@ def sync(config): else: click.echo(f"Uploading new file {local_file} to remote...") with open( - os.path.join(local_path, local_file), "r", encoding="utf-8" + os.path.join(local_path, local_file), "r", encoding="utf-8" ) as file: content = file.read() provider.upload_file( @@ -91,7 +91,9 @@ def sync(config): # Delete remote files that no longer exist locally for file_to_delete in remote_files_to_delete: click.echo(f"Deleting {file_to_delete} from remote...") - remote_file = next(rf for rf in remote_files if rf["file_name"] == file_to_delete) + remote_file = next( + rf for rf in remote_files if rf["file_name"] == file_to_delete + ) provider.delete_file( active_organization_id, active_project_id, remote_file["uuid"] ) diff --git a/src/claudesync/config_manager.py b/src/claudesync/config_manager.py index 85e2482..3fc10f8 100644 --- a/src/claudesync/config_manager.py +++ b/src/claudesync/config_manager.py @@ -14,7 +14,7 @@ def _load_config(self): return { "log_level": "INFO", "upload_delay": 0.5, - "max_file_size": 32 * 1024 # Default 32 KB + "max_file_size": 32 * 1024, # Default 32 KB } with open(self.config_file, "r") as f: config = json.load(f) @@ -35,4 +35,4 @@ def get(self, key, default=None): def set(self, key, value): self.config[key] = value - self._save_config() \ No newline at end of file + self._save_config() diff --git a/src/claudesync/utils.py b/src/claudesync/utils.py index 0f45144..edf6516 100644 --- a/src/claudesync/utils.py +++ b/src/claudesync/utils.py @@ -1,6 +1,6 @@ import os import hashlib -import mimetypes +from chardet import detect from functools import wraps import click @@ -15,64 +15,79 @@ config_manager = ConfigManager() + def calculate_checksum(content): normalized_content = content.replace("\r\n", "\n").replace("\r", "\n").strip() return hashlib.md5(normalized_content.encode("utf-8")).hexdigest() def load_gitignore(base_path): - patterns = [] - current_dir = base_path - while True: - gitignore_path = os.path.join(current_dir, ".gitignore") - if os.path.exists(gitignore_path): - with open(gitignore_path, "r") as f: - patterns.extend(f.read().splitlines()) - - if os.path.exists(os.path.join(current_dir, ".git")): - break # Stop if we've reached the root of the Git repository - - parent_dir = os.path.dirname(current_dir) - if parent_dir == current_dir or parent_dir == base_path: - break # Stop if we've reached the filesystem root or the base watched directory - current_dir = parent_dir - - return pathspec.PathSpec.from_lines("gitwildmatch", patterns) if patterns else None - -def should_ignore(gitignore, local_path): - # Check file type - mime_type, _ = mimetypes.guess_type(local_path) - if mime_type and not mime_type.startswith("text/"): - return True - # Check if .git dir - if ".git" in local_path.split(os.sep): - return True - # Check if temporary editor file - if local_path.endswith("~"): - return True - # Check if too big - max_file_size = config_manager.get("max_file_size", 32 * 1024) # Default to 32 KB if not set - if os.path.getsize(local_path) > max_file_size: - return True - # Check .gitignore - return gitignore.match_file(local_path) if gitignore else False + gitignore_path = os.path.join(base_path, ".gitignore") + if os.path.exists(gitignore_path): + with open(gitignore_path, "r") as f: + return pathspec.PathSpec.from_lines("gitwildmatch", f) + return None + + +def is_text_file(file_path, sample_size=8192): + try: + with open(file_path, "rb") as file: + return b"\x00" not in file.read(sample_size) + except IOError: + return False + + +def calculate_checksum(content): + return hashlib.md5(content.encode("utf-8")).hexdigest() def get_local_files(local_path): gitignore = load_gitignore(local_path) files = {} - for root, _, filenames in os.walk(local_path): + + # List of directories to exclude + exclude_dirs = {".git", ".svn", ".hg", ".bzr", "_darcs", "CVS"} + + for root, dirs, filenames in os.walk(local_path): + # Remove excluded directories + dirs[:] = [d for d in dirs if d not in exclude_dirs] + + rel_root = os.path.relpath(root, local_path) + if rel_root == ".": + rel_root = "" + for filename in filenames: - file_path = os.path.join(root, filename) - if not should_ignore(gitignore, file_path): - rel_path = os.path.relpath(file_path, local_path) - try: - with open(file_path, "r", encoding="utf-8") as file: - content = file.read() - files[rel_path] = calculate_checksum(content) - except Exception as e: - logger.error(f"Error reading file {file_path}: {str(e)}") - continue + rel_path = os.path.join(rel_root, filename) + full_path = os.path.join(root, filename) + + # Skip files larger than 200KB + max_file_size = config_manager.get("max_file_size", 32 * 1024) + if os.path.getsize(full_path) > max_file_size: + continue + + # Skip temporary editor files + if filename.endswith("~"): + continue + + # Use gitignore rules if available + if gitignore and gitignore.match_file(rel_path): + continue + + # Check if it's a text file + if not is_text_file(full_path): + continue + + try: + with open(full_path, "r", encoding="utf-8") as file: + content = file.read() + files[rel_path] = calculate_checksum(content) + except UnicodeDecodeError: + # If UTF-8 decoding fails, it's likely not a text file we can handle + logger.debug(f"Unable to read {full_path} as UTF-8 text. Skipping.") + continue + except Exception as e: + logger.error(f"Error reading file {full_path}: {str(e)}") + return files