From 13021990f1ea610a68b7b3dc4eb6e3ee0dc1016c Mon Sep 17 00:00:00 2001 From: Rohit kumar Date: Sun, 30 Jun 2024 21:13:06 +0530 Subject: [PATCH] Added import support and improved docs --- hawk_scanner/commands/couchdb.py | 22 +-- hawk_scanner/commands/firebase.py | 24 +-- hawk_scanner/commands/fs.py | 25 ++- hawk_scanner/commands/gcs.py | 24 +-- hawk_scanner/commands/gdrive.py | 16 +- hawk_scanner/commands/gdrive_workspace.py | 14 +- hawk_scanner/commands/mongodb.py | 24 +-- hawk_scanner/commands/mysql.py | 20 +-- hawk_scanner/commands/postgresql.py | 20 +-- hawk_scanner/commands/redis.py | 18 +-- hawk_scanner/commands/s3.py | 30 ++-- hawk_scanner/commands/slack.py | 30 ++-- hawk_scanner/commands/text.py | 12 +- hawk_scanner/internals/system.py | 189 +++++++++++----------- hawk_scanner/main.py | 34 ++-- readme.md | 24 ++- setup.py | 2 +- 17 files changed, 273 insertions(+), 255 deletions(-) diff --git a/hawk_scanner/commands/couchdb.py b/hawk_scanner/commands/couchdb.py index 888634a..d4d6592 100644 --- a/hawk_scanner/commands/couchdb.py +++ b/hawk_scanner/commands/couchdb.py @@ -9,13 +9,13 @@ def connect_couchdb(host, port, username, password, database): try: server = couchdb.Server(f"http://{username}:{password}@{host}:{port}/") if database not in server: - system.print_error(f"Database {database} not found on CouchDB server.") + system.print_error(args, f"Database {database} not found on CouchDB server.") return None db = server[database] - system.print_info(f"Connected to CouchDB database") + system.print_info(args, f"Connected to CouchDB database") return db except Exception as e: - system.print_error(f"Failed to connect to CouchDB database with error: {e}") + system.print_error(args, f"Failed to connect to CouchDB database with error: {e}") return None def check_data_patterns(db, patterns, profile_name, database_name): @@ -25,7 +25,7 @@ def check_data_patterns(db, patterns, profile_name, database_name): for field_name, field_value in document.items(): if 
field_value: value_str = str(field_value) - matches = system.match_strings(value_str) + matches = system.match_strings(args, value_str) if matches: for match in matches: results.append({ @@ -44,15 +44,15 @@ def check_data_patterns(db, patterns, profile_name, database_name): def execute(args): results = [] - system.print_info(f"Running Checks for CouchDB Sources") - connections = system.get_connection() + system.print_info(args, f"Running Checks for CouchDB Sources") + connections = system.get_connection(args) if 'sources' in connections: sources_config = connections['sources'] couchdb_config = sources_config.get('couchdb') if couchdb_config: - patterns = system.get_fingerprint_file() + patterns = system.get_fingerprint_file(args) for key, config in couchdb_config.items(): host = config.get('host') @@ -62,16 +62,16 @@ def execute(args): database = config.get('database') if host and username and password and database: - system.print_info(f"Checking CouchDB Profile {key} with host and authentication") + system.print_info(args, f"Checking CouchDB Profile {key} with host and authentication") else: - system.print_error(f"Incomplete CouchDB configuration for key: {key}") + system.print_error(args, f"Incomplete CouchDB configuration for key: {key}") continue db = connect_couchdb(host, port, username, password, database) if db: results += check_data_patterns(db, patterns, key, database) else: - system.print_error("No CouchDB connection details found in connection.yml") + system.print_error(args, "No CouchDB connection details found in connection.yml") else: - system.print_error("No 'sources' section found in connection.yml") + system.print_error(args, "No 'sources' section found in connection.yml") return results diff --git a/hawk_scanner/commands/firebase.py b/hawk_scanner/commands/firebase.py index 64553c7..c159d76 100644 --- a/hawk_scanner/commands/firebase.py +++ b/hawk_scanner/commands/firebase.py @@ -10,7 +10,7 @@ def connect_firebase(credentials_file, bucket_name): 
cred = credentials.Certificate(credentials_file) firebase_admin.initialize_app(cred) bucket = storage.bucket(bucket_name) - system.print_info(f"Connected to Firebase bucket: {bucket_name}") + system.print_info(args, f"Connected to Firebase bucket: {bucket_name}") return bucket except Exception as e: print(f"Failed to connect to Firebase bucket: {e}") @@ -18,7 +18,7 @@ def connect_firebase(credentials_file, bucket_name): def execute(args): results = [] shouldDownload = True - connections = system.get_connection() + connections = system.get_connection(args) if 'sources' in connections: sources_config = connections['sources'] @@ -37,7 +37,7 @@ def execute(args): file_name = blob.name ## get unique etag or hash of file remote_etag = blob.etag - system.print_debug(f"Remote etag: {remote_etag}") + system.print_debug(args, f"Remote etag: {remote_etag}") if system.should_exclude_file(file_name, exclude_patterns): continue @@ -49,20 +49,20 @@ def execute(args): if os.path.exists(file_path): shouldDownload = False local_etag = file_path.split('/')[-1].split('-')[0] - system.print_debug(f"Local etag: {local_etag}") - system.print_debug(f"File already exists in cache, using it. You can disable cache by setting 'cache: false' in connection.yml") + system.print_debug(args, f"Local etag: {local_etag}") + system.print_debug(args, f"File already exists in cache, using it. 
You can disable cache by setting 'cache: false' in connection.yml") if remote_etag != local_etag: - system.print_debug(f"File in firebase bucket has changed, downloading it again...") + system.print_debug(args, f"File in firebase bucket has changed, downloading it again...") shouldDownload = True else: shouldDownload = False if shouldDownload: file_path = f"data/firebase/{remote_etag}-{file_name}" - system.print_debug(f"Downloading file: {file_name} to {file_path}...") + system.print_debug(args, f"Downloading file: {file_name} to {file_path}...") blob.download_to_filename(file_path) - matches = system.read_match_strings(file_path, 'google_cloud_storage') + matches = system.read_match_strings(args, file_path, 'google_cloud_storage') if matches: for match in matches: results.append({ @@ -76,13 +76,13 @@ def execute(args): }) else: - system.print_error(f"Failed to connect to Firebase bucket: {bucket_name}") + system.print_error(args, f"Failed to connect to Firebase bucket: {bucket_name}") else: - system.print_error(f"Incomplete Firebase configuration for key: {key}") + system.print_error(args, f"Incomplete Firebase configuration for key: {key}") else: - system.print_error("No Firebase connection details found in connection file") + system.print_error(args, "No Firebase connection details found in connection file") else: - system.print_error("No 'sources' section found in connection.yml") + system.print_error(args, "No 'sources' section found in connection.yml") if config.get("cache") == False: os.system("rm -rf data/firebase") diff --git a/hawk_scanner/commands/fs.py b/hawk_scanner/commands/fs.py index a74e452..ce7baca 100644 --- a/hawk_scanner/commands/fs.py +++ b/hawk_scanner/commands/fs.py @@ -6,8 +6,8 @@ import concurrent.futures import time -def process_file(file_path, key, results): - matches = system.read_match_strings(file_path, 'fs') +def process_file(args, file_path, key, results): + matches = system.read_match_strings(args, file_path, 'fs') file_data = 
system.getFileData(file_path) if matches: for match in matches: @@ -24,18 +24,18 @@ def process_file(file_path, key, results): def execute(args): results = [] - connections = system.get_connection() + connections = system.get_connection(args) if 'sources' in connections: sources_config = connections['sources'] fs_config = sources_config.get('fs') if fs_config: for key, config in fs_config.items(): if 'path' not in config: - system.print_error(f"Path not found in fs profile '{key}'") + system.print_error(args, f"Path not found in fs profile '{key}'") continue path = config.get('path') if not os.path.exists(path): - system.print_error(f"Path '{path}' does not exist") + system.print_error(args, f"Path '{path}' does not exist") exclude_patterns = fs_config.get(key, {}).get('exclude_patterns', []) start_time = time.time() @@ -51,21 +51,14 @@ def execute(args): futures = [] for file_path in files: file_count += 1 - futures.append(executor.submit(process_file, file_path, key, results)) + futures.append(executor.submit(process_file, args, file_path, key, results)) # Wait for all tasks to complete concurrent.futures.wait(futures) end_time = time.time() - system.print_info(f"Time taken to analyze {file_count} files: {end_time - start_time} seconds") + system.print_info(args, f"Time taken to analyze {file_count} files: {end_time - start_time} seconds") else: - system.print_error("No filesystem 'fs' connection details found in connection.yml") + system.print_error(args, "No filesystem 'fs' connection details found in connection.yml") else: - system.print_error("No 'sources' section found in connection.yml") + system.print_error(args, "No 'sources' section found in connection.yml") return results - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Add your command-line arguments here if needed - args = parser.parse_args() - results = execute(args) - # Handle results as needed diff --git a/hawk_scanner/commands/gcs.py b/hawk_scanner/commands/gcs.py index 
c22adf0..059fbe5 100644 --- a/hawk_scanner/commands/gcs.py +++ b/hawk_scanner/commands/gcs.py @@ -13,7 +13,7 @@ def connect_google_cloud(bucket_name, credentials_file): os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = credentials_file client = storage.Client() bucket = client.get_bucket(bucket_name) - system.print_debug(f"Connected to Google Cloud Storage bucket: {bucket_name}") + system.print_debug(args, f"Connected to Google Cloud Storage bucket: {bucket_name}") return bucket except Exception as e: print(f"Failed to connect to Google Cloud Storage bucket: {e}") @@ -25,7 +25,7 @@ def get_last_update_time(blob): def execute(args): results = [] shouldDownload = True - connections = system.get_connection() + connections = system.get_connection(args) if 'sources' in connections: sources_config = connections['sources'] @@ -44,7 +44,7 @@ def execute(args): file_name = blob.name ## get unique etag or hash of file remote_etag = get_last_update_time(blob) - system.print_debug(f"Remote etag: {remote_etag}") + system.print_debug(args, f"Remote etag: {remote_etag}") if system.should_exclude_file(file_name, exclude_patterns): continue @@ -56,19 +56,19 @@ def execute(args): if os.path.exists(file_path): shouldDownload = False local_etag = file_path.split('/')[-1].split('-')[0] - system.print_debug(f"Local etag: {local_etag}") - system.print_debug(f"File already exists in cache, using it. You can disable cache by setting 'cache: false' in connection.yml") + system.print_debug(args, f"Local etag: {local_etag}") + system.print_debug(args, f"File already exists in cache, using it. 
You can disable cache by setting 'cache: false' in connection.yml") if remote_etag != local_etag: - system.print_debug(f"File in Google Cloud Storage bucket has changed, downloading it again...") + system.print_debug(args, f"File in Google Cloud Storage bucket has changed, downloading it again...") shouldDownload = True else: shouldDownload = False if shouldDownload: - system.print_debug(f"Downloading file: {file_name} to {file_path}...") + system.print_debug(args, f"Downloading file: {file_name} to {file_path}...") blob.download_to_filename(file_path) - matches = system.read_match_strings(file_path, 'google_cloud_storage') + matches = system.read_match_strings(args, file_path, 'google_cloud_storage') if matches: for match in matches: results.append({ @@ -81,13 +81,13 @@ def execute(args): 'data_source': 'gcs' }) else: - system.print_error(f"Failed to connect to Google Cloud Storage bucket: {bucket_name}") + system.print_error(args, f"Failed to connect to Google Cloud Storage bucket: {bucket_name}") else: - system.print_error(f"Incomplete Google Cloud Storage configuration for key: {key}") + system.print_error(args, f"Incomplete Google Cloud Storage configuration for key: {key}") else: - system.print_error("No Google Cloud Storage connection details found in connection.yml") + system.print_error(args, "No Google Cloud Storage connection details found in connection.yml") else: - system.print_error("No 'sources' section found in connection.yml") + system.print_error(args, "No 'sources' section found in connection.yml") if config.get("cache") == False: os.system("rm -rf data/google_cloud_storage") return results diff --git a/hawk_scanner/commands/gdrive.py b/hawk_scanner/commands/gdrive.py index c457d18..d6c3cd2 100644 --- a/hawk_scanner/commands/gdrive.py +++ b/hawk_scanner/commands/gdrive.py @@ -16,7 +16,7 @@ def connect_google_drive(credentials_file): try: fs = GDriveFileSystem("root", client_id=client_id, client_secret=client_secret, token=credentials_file) - 
system.print_debug("Connected to Google Drive") + system.print_debug(args, "Connected to Google Drive") drive = fs.client return drive except Exception as e: @@ -53,7 +53,7 @@ def download_file(drive, file_obj, base_path): else: file_obj.GetContentFile(file_path) - system.print_debug(f"File downloaded to: {file_path}") + system.print_debug(args, f"File downloaded to: {file_path}") except Exception as e: print(f"Failed to download file: {e}") @@ -68,7 +68,7 @@ def list_files(drive, folder_name=None): def execute(args): results = [] should_download = True - connections = system.get_connection() + connections = system.get_connection(args) is_cache_enabled = False drive_config = None @@ -76,7 +76,7 @@ def execute(args): sources_config = connections['sources'] drive_config = sources_config.get('gdrive') else: - system.print_error("No 'sources' section found in connection.yml") + system.print_error(args, "No 'sources' section found in connection.yml") if drive_config: for key, config in drive_config.items(): @@ -113,14 +113,14 @@ def execute(args): if config.get("cache") and os.path.exists(file_path): should_download = False - system.print_debug(f"File already exists in cache, using it.") + system.print_debug(args, f"File already exists in cache, using it.") else: should_download = True if should_download: download_file(drive, file_obj, "data/google_drive") - matches = system.read_match_strings(file_path, 'gdrive') + matches = system.read_match_strings(args, file_path, 'gdrive') if matches: for match in matches: results.append({ @@ -134,9 +134,9 @@ def execute(args): 'data_source': 'gdrive' }) else: - system.print_error("Failed to connect to Google Drive") + system.print_error(args, "Failed to connect to Google Drive") else: - system.print_error("No Google Drive connection details found in connection file") + system.print_error(args, "No Google Drive connection details found in connection file") if not is_cache_enabled: os.system("rm -rf data/google_drive") diff --git 
a/hawk_scanner/commands/gdrive_workspace.py b/hawk_scanner/commands/gdrive_workspace.py index 364782a..26f86d4 100644 --- a/hawk_scanner/commands/gdrive_workspace.py +++ b/hawk_scanner/commands/gdrive_workspace.py @@ -64,7 +64,7 @@ def download_file(drive, file_obj, base_path): except Exception as e: print(f"Failed to write file: {e}") - system.print_debug(f"File downloaded to: {folder_path}") + system.print_debug(args, f"File downloaded to: {folder_path}") except Exception as e: print(f"Failed to download file: {e}") @@ -83,14 +83,14 @@ def list_files(drive, impersonate_user=None): def execute(args): results = [] - connections = system.get_connection() + connections = system.get_connection(args) is_cache_enabled = False if 'sources' in connections: sources_config = connections['sources'] drive_config = sources_config.get('gdrive_workspace') else: - system.print_error("No 'sources' section found in connection.yml") + system.print_error(args, "No 'sources' section found in connection.yml") if drive_config: for key, config in drive_config.items(): @@ -121,14 +121,14 @@ def execute(args): if config.get("cache") and os.path.exists(file_path): is_cache_enabled = False - system.print_debug(f"File already exists in cache, using it.") + system.print_debug(args, f"File already exists in cache, using it.") else: is_cache_enabled = True if is_cache_enabled: download_file(drive, file_obj, "data/google_drive/") - matches = system.read_match_strings(file_path, 'gdrive_workspace') + matches = system.read_match_strings(args, file_path, 'gdrive_workspace') file_name = file_name.replace('-runtime.pdf', '') if matches: for match in matches: @@ -144,9 +144,9 @@ def execute(args): 'data_source': 'gdrive_workspace' }) else: - system.print_error("Failed to connect to Google Drive") + system.print_error(args, "Failed to connect to Google Drive") else: - system.print_error("No Google Drive connection details found in connection file") + system.print_error(args, "No Google Drive connection 
details found in connection file") """if not is_cache_enabled: os.system("rm -rf data/google_drive")""" diff --git a/hawk_scanner/commands/mongodb.py b/hawk_scanner/commands/mongodb.py index 7ce6359..f532cc4 100644 --- a/hawk_scanner/commands/mongodb.py +++ b/hawk_scanner/commands/mongodb.py @@ -12,14 +12,14 @@ def connect_mongodb(host, port, username, password, database, uri=None): client = pymongo.MongoClient(host=host, port=port, username=username, password=password) if database not in client.list_database_names(): - system.print_error(f"Database {database} not found on MongoDB server.") + system.print_error(args, f"Database {database} not found on MongoDB server.") return None db = client[database] - system.print_info(f"Connected to MongoDB database") + system.print_info(args, f"Connected to MongoDB database") return db except Exception as e: - system.print_error(f"Failed to connect to MongoDB database with error: {e}") + system.print_error(args, f"Failed to connect to MongoDB database with error: {e}") return None @@ -42,7 +42,7 @@ def check_data_patterns(db, patterns, profile_name, database_name, limit_start=0 for field_name, field_value in document.items(): if field_value: value_str = str(field_value) - matches = system.match_strings(value_str) + matches = system.match_strings(args, value_str) if matches: for match in matches: results.append({ @@ -61,15 +61,15 @@ def check_data_patterns(db, patterns, profile_name, database_name, limit_start=0 def execute(args): results = [] - system.print_info(f"Running Checks for MongoDB Sources") - connections = system.get_connection() + system.print_info(args, f"Running Checks for MongoDB Sources") + connections = system.get_connection(args) if 'sources' in connections: sources_config = connections['sources'] mongodb_config = sources_config.get('mongodb') if mongodb_config: - patterns = system.get_fingerprint_file() + patterns = system.get_fingerprint_file(args) for key, config in mongodb_config.items(): host = 
config.get('host') @@ -83,20 +83,20 @@ def execute(args): collections = config.get('collections', []) if uri: - system.print_info(f"Checking MongoDB Profile {key} using URI") + system.print_info(args, f"Checking MongoDB Profile {key} using URI") elif host and username and password and database: - system.print_info(f"Checking MongoDB Profile {key} with host and authentication") + system.print_info(args, f"Checking MongoDB Profile {key} with host and authentication") else: - system.print_error(f"Incomplete MongoDB configuration for key: {key}") + system.print_error(args, f"Incomplete MongoDB configuration for key: {key}") continue db = connect_mongodb(host, port, username, password, database, uri) if db: results += check_data_patterns(db, patterns, key, database, limit_start=limit_start, limit_end=limit_end, whitelisted_collections=collections) else: - system.print_error("No MongoDB connection details found in connection.yml") + system.print_error(args, "No MongoDB connection details found in connection.yml") else: - system.print_error("No 'sources' section found in connection.yml") + system.print_error(args, "No 'sources' section found in connection.yml") return results # Example usage diff --git a/hawk_scanner/commands/mysql.py b/hawk_scanner/commands/mysql.py index c32c51c..fe43886 100644 --- a/hawk_scanner/commands/mysql.py +++ b/hawk_scanner/commands/mysql.py @@ -14,10 +14,10 @@ def connect_mysql(host, port, user, password, database): database=database ) if conn: - system.print_info(f"Connected to MySQL database at {host}") + system.print_info(args, f"Connected to MySQL database at {host}") return conn except Exception as e: - system.print_error(f"Failed to connect to MySQL database at {host} with error: {e}") + system.print_error(args, f"Failed to connect to MySQL database at {host} with error: {e}") def check_data_patterns(conn, patterns, profile_name, database_name, limit_start=0, limit_end=500, whitelisted_tables=None): cursor = conn.cursor() @@ -42,7 +42,7 
@@ def check_data_patterns(conn, patterns, profile_name, database_name, limit_start for column, value in zip(columns, row): if value: value_str = str(value) - matches = system.match_strings(value_str) + matches = system.match_strings(args, value_str) if matches: for match in matches: results.append({ @@ -66,14 +66,14 @@ def check_data_patterns(conn, patterns, profile_name, database_name, limit_start def execute(args): results = [] - system.print_info(f"Running Checks for MySQL Sources") - connections = system.get_connection() + system.print_info(args, f"Running Checks for MySQL Sources") + connections = system.get_connection(args) if 'sources' in connections: sources_config = connections['sources'] mysql_config = sources_config.get('mysql') if mysql_config: - patterns = system.get_fingerprint_file() + patterns = system.get_fingerprint_file(args) for key, config in mysql_config.items(): host = config.get('host') @@ -86,17 +86,17 @@ def execute(args): tables = config.get('tables', []) if host and user and database: - system.print_info(f"Checking MySQL Profile {key} and database {database}") + system.print_info(args, f"Checking MySQL Profile {key} and database {database}") conn = connect_mysql(host, port, user, password, database) if conn: results += check_data_patterns(conn, patterns, key, database, limit_start=limit_start, limit_end=limit_end, whitelisted_tables=tables) conn.close() else: - system.print_error(f"Incomplete MySQL configuration for key: {key}") + system.print_error(args, f"Incomplete MySQL configuration for key: {key}") else: - system.print_error("No MySQL connection details found in connection.yml") + system.print_error(args, "No MySQL connection details found in connection.yml") else: - system.print_error("No 'sources' section found in connection.yml") + system.print_error(args, "No 'sources' section found in connection.yml") return results # Example usage diff --git a/hawk_scanner/commands/postgresql.py b/hawk_scanner/commands/postgresql.py index 
282dc0e..141edbd 100644 --- a/hawk_scanner/commands/postgresql.py +++ b/hawk_scanner/commands/postgresql.py @@ -14,10 +14,10 @@ def connect_postgresql(host, port, user, password, database): database=database ) if conn: - system.print_info(f"Connected to PostgreSQL database at {host}") + system.print_info(args, f"Connected to PostgreSQL database at {host}") return conn except Exception as e: - system.print_error(f"Failed to connect to PostgreSQL database at {host} with error: {e}") + system.print_error(args, f"Failed to connect to PostgreSQL database at {host} with error: {e}") def check_data_patterns(conn, patterns, profile_name, database_name, limit_start=0, limit_end=500, whitelisted_tables=None): cursor = conn.cursor() @@ -46,7 +46,7 @@ def check_data_patterns(conn, patterns, profile_name, database_name, limit_start for column, value in zip(columns, row): if value: value_str = str(value) - matches = system.match_strings(value_str) + matches = system.match_strings(args, value_str) if matches: for match in matches: results.append({ @@ -70,15 +70,15 @@ def check_data_patterns(conn, patterns, profile_name, database_name, limit_start def execute(args): results = [] - system.print_info(f"Running Checks for PostgreSQL Sources") - connections = system.get_connection() + system.print_info(args, f"Running Checks for PostgreSQL Sources") + connections = system.get_connection(args) if 'sources' in connections: sources_config = connections['sources'] postgresql_config = sources_config.get('postgresql') if postgresql_config: - patterns = system.get_fingerprint_file() + patterns = system.get_fingerprint_file(args) for key, config in postgresql_config.items(): host = config.get('host') @@ -91,17 +91,17 @@ def execute(args): tables = config.get('tables', []) if host and user and password and database: - system.print_info(f"Checking PostgreSQL Profile {key}, database {database}") + system.print_info(args, f"Checking PostgreSQL Profile {key}, database {database}") conn = 
connect_postgresql(host, port, user, password, database) if conn: results += check_data_patterns(conn, patterns, key, database, limit_start=limit_start, limit_end=limit_end, whitelisted_tables=tables) conn.close() else: - system.print_error(f"Incomplete PostgreSQL configuration for key: {key}") + system.print_error(args, f"Incomplete PostgreSQL configuration for key: {key}") else: - system.print_error("No PostgreSQL connection details found in connection.yml") + system.print_error(args, "No PostgreSQL connection details found in connection.yml") else: - system.print_error("No 'sources' section found in connection.yml") + system.print_error(args, "No 'sources' section found in connection.yml") return results # Example usage diff --git a/hawk_scanner/commands/redis.py b/hawk_scanner/commands/redis.py index 91cd5f7..03dc11c 100644 --- a/hawk_scanner/commands/redis.py +++ b/hawk_scanner/commands/redis.py @@ -10,12 +10,12 @@ def connect_redis(host, port, password=None): try: r = redis.Redis(host=host, port=port, password=password) if r.ping(): - system.print_info(f"Redis instance at {host}:{port} is accessible") + system.print_info(args, f"Redis instance at {host}:{port} is accessible") return r else: - system.print_error(f"Redis instance at {host}:{port} is not accessible") + system.print_error(args, f"Redis instance at {host}:{port} is not accessible") except Exception as e: - system.print_error(f"Redis instance at {host}:{port} is not accessible with error: {e}") + system.print_error(args, f"Redis instance at {host}:{port} is not accessible with error: {e}") def get_patterns_from_file(file_path): with open(file_path, 'r', encoding='utf-8') as file: @@ -30,7 +30,7 @@ def check_data_patterns(redis_instance, patterns, profile_name, host): data = redis_instance.get(key) if data: data_str = data.decode('utf-8') - matches = system.match_strings(data_str) + matches = system.match_strings(args, data_str) if matches: for match in matches: results.append({ @@ -46,14 +46,14 @@ 
def check_data_patterns(redis_instance, patterns, profile_name, host): def execute(args): results = [] - connections = system.get_connection() + connections = system.get_connection(args) if 'sources' in connections: sources_config = connections['sources'] redis_config = sources_config.get('redis') if redis_config: - patterns = system.get_fingerprint_file() + patterns = system.get_fingerprint_file(args) for profile_name, config in redis_config.items(): host = config.get('host') @@ -66,9 +66,9 @@ def execute(args): results = check_data_patterns(redis_instance, patterns, profile_name, host) redis_instance.close() else: - system.print_error(f"Incomplete Redis configuration for key: {profile_name}") + system.print_error(args, f"Incomplete Redis configuration for key: {profile_name}") else: - system.print_error("No Redis connection details found in connection.yml") + system.print_error(args, "No Redis connection details found in connection.yml") else: - system.print_error("No 'sources' section found in connection.yml") + system.print_error(args, "No 'sources' section found in connection.yml") return results diff --git a/hawk_scanner/commands/s3.py b/hawk_scanner/commands/s3.py index cec5dbc..d3b21ef 100644 --- a/hawk_scanner/commands/s3.py +++ b/hawk_scanner/commands/s3.py @@ -15,10 +15,10 @@ def connect_s3(access_key, secret_key, bucket_name): ) s3 = session.resource('s3') bucket = s3.Bucket(bucket_name) - system.print_info(f"Connected to S3 bucket: {bucket_name}") + system.print_info(args, f"Connected to S3 bucket: {bucket_name}") return bucket except Exception as e: - system.print_error(f"[bold red]Failed[/bold red] to connect to S3 bucket: {e}") + system.print_error(args, f"[bold red]Failed[/bold red] to connect to S3 bucket: {e}") def get_last_update_time(obj): last_modified = obj.last_modified @@ -34,8 +34,8 @@ def get_patterns_from_file(file_path): def execute(args): results = [] shouldDownload = True - system.print_info(f"Running Checks for S3 Sources") - 
connections = system.get_connection() + system.print_info(args, f"Running Checks for S3 Sources") + connections = system.get_connection(args) if 'sources' in connections: sources_config = connections['sources'] s3_config = sources_config.get('s3') @@ -47,7 +47,7 @@ def execute(args): bucket_name = config.get('bucket_name') exclude_patterns = config.get(key, {}).get('exclude_patterns', []) - system.print_info(f"Checking S3 profile: '{key}' with bucket '{bucket_name}'") + system.print_info(args, f"Checking S3 profile: '{key}' with bucket '{bucket_name}'") profile_name = key if access_key and secret_key and bucket_name: bucket = connect_s3(access_key, secret_key, bucket_name) @@ -55,7 +55,7 @@ def execute(args): for obj in bucket.objects.all(): remote_etag = obj.e_tag.replace('"', '') - system.print_debug(f"Remote etag: {remote_etag}") + system.print_debug(args, f"Remote etag: {remote_etag}") file_name = obj.key if system.should_exclude_file(file_name, exclude_patterns): continue @@ -66,20 +66,20 @@ def execute(args): if os.path.exists(file_path): shouldDownload = False local_etag = file_path.split('/')[-1].split('-')[0] - system.print_debug(f"Local etag: {local_etag}") - system.print_debug(f"File already exists in cache, using it. You can disable cache by setting 'cache: false' in connection.yml") + system.print_debug(args, f"Local etag: {local_etag}") + system.print_debug(args, f"File already exists in cache, using it. 
You can disable cache by setting 'cache: false' in connection.yml") if remote_etag != local_etag: - system.print_debug(f"File in S3 bucket has changed, downloading it again...") + system.print_debug(args, f"File in S3 bucket has changed, downloading it again...") shouldDownload = True else: shouldDownload = False if shouldDownload: file_path = f"data/s3/{remote_etag}-{file_name}" - system.print_debug(f"Downloading file: {file_name} to {file_path}...") + system.print_debug(args, f"Downloading file: {file_name} to {file_path}...") bucket.download_file(file_name, file_path) - matches = system.read_match_strings(file_path, 'google_cloud_storage') + matches = system.read_match_strings(args, file_path, 'google_cloud_storage') if matches: for match in matches: results.append({ @@ -93,13 +93,13 @@ def execute(args): }) else: - system.print_error(f"Failed to connect to S3 bucket: {bucket_name}") + system.print_error(args, f"Failed to connect to S3 bucket: {bucket_name}") else: - system.print_error(f"Incomplete S3 configuration for key: {key}") + system.print_error(args, f"Incomplete S3 configuration for key: {key}") if config.get("cache") == False: os.system("rm -rf data/s3") else: - system.print_error("No S3 connection details found in connection.yml") + system.print_error(args, "No S3 connection details found in connection.yml") else: - system.print_error("No 'sources' section found in connection.yml") + system.print_error(args, "No 'sources' section found in connection.yml") return results diff --git a/hawk_scanner/commands/slack.py b/hawk_scanner/commands/slack.py index a2e09e8..1e71a7f 100644 --- a/hawk_scanner/commands/slack.py +++ b/hawk_scanner/commands/slack.py @@ -12,13 +12,13 @@ def connect_slack(token): # Test the connection by making an API call response = client.auth_test() if response["ok"]: - system.print_info("Connected to Slack") + system.print_info(args, "Connected to Slack") return client else: - system.print_error("Failed to authenticate with Slack") + 
system.print_error(args, "Failed to authenticate with Slack") return None except SlackApiError as e: - system.print_error(f"Failed to connect to Slack with error: {e.response['error']}") + system.print_error(args, f"Failed to connect to Slack with error: {e.response['error']}") return None def check_slack_messages(client, patterns, profile_name, channel_types, channel_names=None): @@ -33,22 +33,22 @@ def check_slack_messages(client, patterns, profile_name, channel_types, channel_ if channel_names: channels = [channel for channel in channels if channel['name'] in channel_names] - system.print_info(f"Found {len(channels)} channels of type {channel_types}") - system.print_info(f"Checking messages in channels: {', '.join([channel['name'] for channel in channels])}") + system.print_info(args, f"Found {len(channels)} channels of type {channel_types}") + system.print_info(args, f"Checking messages in channels: {', '.join([channel['name'] for channel in channels])}") for channel in channels: channel_name = channel["name"] channel_id = channel["id"] # Get messages from the channel - system.print_info(f"Checking messages in channel {channel_name} ({channel_id})") + system.print_info(args, f"Checking messages in channel {channel_name} ({channel_id})") messages = client.conversations_history(channel=channel_id)["messages"] for message in messages: user = message.get("user", "") text = message.get("text") if text: - matches = system.match_strings(text) + matches = system.match_strings(args, text) if matches: for match in matches: results.append({ @@ -64,20 +64,20 @@ def check_slack_messages(client, patterns, profile_name, channel_types, channel_ }) return results except SlackApiError as e: - system.print_error(f"Failed to fetch messages from Slack with error: {e.response['error']}") + system.print_error(args, f"Failed to fetch messages from Slack with error: {e.response['error']}") return results def execute(args): results = [] - system.print_info("Running Checks for Slack 
Sources") - connections = system.get_connection() + system.print_info(args, "Running Checks for Slack Sources") + connections = system.get_connection(args) if 'sources' in connections: sources_config = connections['sources'] slack_config = sources_config.get('slack') if slack_config: - patterns = system.get_fingerprint_file() + patterns = system.get_fingerprint_file(args) for key, config in slack_config.items(): token = config.get('token') @@ -85,17 +85,17 @@ def execute(args): channel_names = config.get('channel_names', None) if token: - system.print_info(f"Checking Slack Profile {key}") + system.print_info(args, f"Checking Slack Profile {key}") else: - system.print_error(f"Incomplete Slack configuration for key: {key}") + system.print_error(args, f"Incomplete Slack configuration for key: {key}") continue client = connect_slack(token) if client: results += check_slack_messages(client, patterns, key, channel_types, channel_names) else: - system.print_error("No Slack connection details found in connection.yml") + system.print_error(args, "No Slack connection details found in connection.yml") else: - system.print_error("No 'sources' section found in connection.yml") + system.print_error(args, "No 'sources' section found in connection.yml") return results \ No newline at end of file diff --git a/hawk_scanner/commands/text.py b/hawk_scanner/commands/text.py index 5baa088..33c64aa 100644 --- a/hawk_scanner/commands/text.py +++ b/hawk_scanner/commands/text.py @@ -5,7 +5,7 @@ def check_data_patterns(value, patterns, profile_name): value_str = str(value) - matches = system.match_strings(value_str) + matches = system.match_strings(args, value_str) results = [] if matches: for match in matches: @@ -20,9 +20,9 @@ def check_data_patterns(value, patterns, profile_name): def execute(args): results = [] - system.print_info(f"Running Checks for Simple text") - connections = system.get_connection() - patterns = system.get_fingerprint_file() + system.print_info(args, f"Running 
Checks for Simple text") + connections = system.get_connection(args) + patterns = system.get_fingerprint_file(args) if 'sources' in connections: sources_config = connections['sources'] text_config = sources_config.get('text') @@ -32,9 +32,9 @@ def execute(args): text = config.get('text', None) results += check_data_patterns(text, patterns, key) else: - system.print_error("No text connection details found in connection.yml") + system.print_error(args, "No text connection details found in connection.yml") else: - system.print_error("No 'sources' section found in connection.yml") + system.print_error(args, "No 'sources' section found in connection.yml") return results # Example usage diff --git a/hawk_scanner/internals/system.py b/hawk_scanner/internals/system.py index 9b02171..43dfffd 100644 --- a/hawk_scanner/internals/system.py +++ b/hawk_scanner/internals/system.py @@ -16,50 +16,41 @@ data_sources = ['s3', 'mysql', 'redis', 'firebase', 'gcs', 'fs', 'postgresql', 'mongodb', 'slack', 'couchdb', 'gdrive', 'gdrive_workspace', 'text'] data_sources_option = ['all'] + data_sources -parser = argparse.ArgumentParser(description='🦅 A powerful scanner to scan your Filesystem, S3, MySQL, PostgreSQL, MongoDB, Redis, Google Cloud Storage and Firebase storage for PII and sensitive data.') -parser.add_argument('command', nargs='?', choices=data_sources_option, help='Command to execute') -parser.add_argument('--connection', action='store', help='YAML Connection file path') -parser.add_argument('--connection-json', type=str, help='Connection details in JSON format, useful for passing connection info directly as CLI Input') -parser.add_argument('--fingerprint', action='store', help='Override YAML fingerprint file path') -parser.add_argument('--json', help='Save output to a json file') -parser.add_argument('--stdout', action='store_true', help='Print output to stdout in JSON format') -parser.add_argument('--quiet', action='store_true', help='Print only the results') 
-parser.add_argument('--debug', action='store_true', help='Enable debug mode') -parser.add_argument('--no-write', action='store_true', help='Do not write previous alerts to file, this may flood you with duplicate alerts') -parser.add_argument('--shutup', action='store_true', help='Suppress the Hawk Eye banner 🫣', default=False) - -args = parser.parse_args() - -# Create a TinyDB instance for storing previous alert hashes -db = None - -if not args.no_write: - db = TinyDB('previous_alerts.json') - -if args.quiet: - args.shutup = True +def parse_args(args=None): + parser = argparse.ArgumentParser(description='🦅 A powerful scanner to scan your Filesystem, S3, MySQL, PostgreSQL, MongoDB, Redis, Google Cloud Storage and Firebase storage for PII and sensitive data.') + parser.add_argument('command', nargs='?', choices=data_sources_option, help='Command to execute') + parser.add_argument('--connection', action='store', help='YAML Connection file path') + parser.add_argument('--connection-json', type=str, help='Connection details in JSON format, useful for passing connection info directly as CLI Input') + parser.add_argument('--fingerprint', action='store', help='Override YAML fingerprint file path') + parser.add_argument('--json', help='Save output to a json file') + parser.add_argument('--stdout', action='store_true', help='Print output to stdout in JSON format') + parser.add_argument('--quiet', action='store_true', help='Print only the results') + parser.add_argument('--debug', action='store_true', help='Enable debug mode') + parser.add_argument('--no-write', action='store_true', help='Do not write previous alerts to file, this may flood you with duplicate alerts') + parser.add_argument('--shutup', action='store_true', help='Suppress the Hawk Eye banner 🫣', default=False) + return parser.parse_args(args) console = Console() def calculate_msg_hash(msg): return hashlib.sha256(msg.encode()).hexdigest() -def print_info(message): +def print_info(args, message): if not 
args.quiet: console.print(f"[yellow][INFO][/yellow] {str(message)}") -def print_debug(message): - if args.debug and not args.quiet: +def print_debug(args, message): + if args and type(args) == argparse.Namespace and args.debug and not args.quiet: try: console.print(f"[blue][DEBUG][/blue] {str(message)}") except Exception as e: pass -def print_error(message): +def print_error(args, message): if not args.quiet: console.print(f"[bold red]❌ {message}") -def print_success(message): +def print_success(args, message): if not args.quiet: console.print(f"[bold green]✅ {message}") @@ -118,53 +109,57 @@ def RedactData(input_string): return redacted_string -def get_connection(): +def get_connection(args): if args.connection: if os.path.exists(args.connection): with open(args.connection, 'r') as file: connections = yaml.safe_load(file) return connections else: - print_error(f"Connection file not found: {args.connection}") + print_error(args, f"Connection file not found: {args.connection}") exit(1) elif args.connection_json: try: connections = json.loads(args.connection_json) return connections except json.JSONDecodeError as e: - print_error(f"Error parsing JSON: {e}") + print_error(args, f"Error parsing JSON: {e}") exit(1) else: - print_error("Please provide a connection file using --connection flag or connection details using --connection-json flag") + print_error(args, "Please provide a connection file using --connection flag or connection details using --connection-json flag") exit(1) -def get_fingerprint_file(): - if args.fingerprint: +def get_fingerprint_file(args=None): + if args and type(args) == argparse.Namespace and args.fingerprint: if os.path.exists(args.fingerprint): with open(args.fingerprint, 'r') as file: return yaml.safe_load(file) else: - print_error(f"Fingerprint file not found: {args.fingerprint}") + if args: + print_error(args, f"Fingerprint file not found: {args.fingerprint}") exit(1) + elif args and type(args) == dict and 'fingerprint' in args: + return 
args['fingerprint'] else: file_path = "https://github.com/rohitcoder/hawk-eye/raw/main/fingerprint.yml" try: response = requests.get(file_path, timeout=10) - print_info(f"Downloading default fingerprint.yml from {file_path}") + if args: + print_info(args, f"Downloading default fingerprint.yml from {file_path}") if response.status_code == 200: with open('fingerprint.yml', 'wb') as file: file.write(response.content) return yaml.safe_load(response.content) else: - print_error(f"Unable to download default fingerprint.yml please provide your own fingerprint file using --fingerprint flag") + if args: + print_error(args, f"Unable to download default fingerprint.yml please provide your own fingerprint file using --fingerprint flag") exit(1) except Exception as e: - print_error(f"Unable to download default fingerprint.yml please provide your own fingerprint file using --fingerprint flag") + if args: + print_error(args, f"Unable to download default fingerprint.yml please provide your own fingerprint file using --fingerprint flag") exit(1) -patterns = get_fingerprint_file() - -def print_banner(): +def print_banner(args): banner = r""" /T /I / |/ | .-~/ @@ -199,37 +194,42 @@ def print_banner(): (_/ / | | j-" ~^~^ ~-<_(_.^-~" """ + if args.quiet: + args.shutup = True if not args.shutup: console.print(banner) -connections = get_connection() - -def match_strings(content, source='text'): +def match_strings(args, content, source='text'): + redacted = False + if args and 'connection' in args: + connections = get_connection(args) + if 'notify' in connections: + redacted: bool = connections.get('notify', {}).get('redacted', False) + + patterns = get_fingerprint_file(args) matched_strings = [] - if 'notify' in connections: - redacted: bool = connections.get('notify', {}).get('redacted', False) - else: - redacted = False - for pattern_name, pattern_regex in patterns.items(): - print_debug(f"Matching pattern: {pattern_name}") + if args: + print_debug(args, f"Matching pattern: 
{pattern_name}") found = {} ## parse pattern_regex as Regex complied_regex = re.compile(pattern_regex, re.IGNORECASE) - print_debug(f"Regex: {complied_regex}") - print_debug(f"Content: {content}") + if args: + print_debug(args, f"Regex: {complied_regex}") + print_debug(args, f"Content: {content}") matches = re.findall(complied_regex, content) - print_debug(f"Matches: {matches}") + print_debug(args, f"Matches: {matches}") found['data_source'] = source if matches: - print_debug(f"Found {len(matches)} matches for pattern: {pattern_name}") + print_debug(args, f"Found {len(matches)} matches for pattern: {pattern_name}") found['pattern_name'] = pattern_name redacted_matches = [] if redacted: - print_debug(f"Redacting matches for pattern: {pattern_name}") + if args: + print_debug(args, f"Redacting matches for pattern: {pattern_name}") for match in matches: - print_debug(f"Redacting match: {match}") + print_debug(args, f"Redacting match: {match}") redacted_matches.append(RedactData(match)) found['matches'] = redacted_matches else: @@ -241,18 +241,19 @@ def match_strings(content, source='text'): found['sample_text'] = content[:50] matched_strings.append(found) - print_debug(f"Matched strings: {matched_strings}") + if args: + print_debug(args, f"Matched strings: {matched_strings}") return matched_strings def should_exclude_file(file_name, exclude_patterns): _, extension = os.path.splitext(file_name) if extension in exclude_patterns: - print_debug(f"Excluding file: {file_name} because of extension: {extension}") + print_debug(args, f"Excluding file: {file_name} because of extension: {extension}") return True for pattern in exclude_patterns: if pattern in file_name: - print_debug(f"Excluding file: {file_name} because of pattern: {pattern}") + print_debug(args, f"Excluding file: {file_name} because of pattern: {pattern}") return True return False @@ -270,37 +271,40 @@ def list_all_files_iteratively(path, exclude_patterns): if not should_exclude_file(file, exclude_patterns): 
yield os.path.join(root, file) -def read_match_strings(file_path, source): - print_info(f"Scanning file: {file_path} for Source: {source}") +def scan_file(file_path, args=None, source=None): content = '' is_archive = False + # Check if the file is an image + if file_path.lower().endswith(('.png', '.jpg', '.jpeg', '.gif', '.bmp')): + content = enhance_and_ocr(file_path) + # Check if the file is a PDF document + elif file_path.lower().endswith('.pdf'): + content = read_pdf(file_path) + # Check if the file is an office document (Word, Excel, PowerPoint) + elif file_path.lower().endswith(('.docx', '.xlsx', '.pptx')): + content = read_office_document(file_path) + # Check if the file is an archive (zip, rar, tar, tar.gz) + elif file_path.lower().endswith(('.zip', '.rar', '.tar', '.tar.gz')): + ## this is archive, so we need to extract it and find pii from it, and return matched_strings + matched_strings = find_pii_in_archive(file_path, source) + is_archive = True + else: + # For other file types, read content normally + with open(file_path, 'rb') as file: + # Attempt to decode using UTF-8, fallback to 'latin-1' if needed + content = file.read().decode('utf-8', errors='replace') + + if not is_archive: + matched_strings = match_strings(args, content, source) + return matched_strings + +def read_match_strings(args, file_path, source): + print_info(args, f"Scanning file: {file_path} for Source: {source}") try: - # Check if the file is an image - if file_path.lower().endswith(('.png', '.jpg', '.jpeg', '.gif', '.bmp')): - content = enhance_and_ocr(file_path) - # Check if the file is a PDF document - elif file_path.lower().endswith('.pdf'): - content = read_pdf(file_path) - # Check if the file is an office document (Word, Excel, PowerPoint) - elif file_path.lower().endswith(('.docx', '.xlsx', '.pptx')): - content = read_office_document(file_path) - # Check if the file is an archive (zip, rar, tar, tar.gz) - elif file_path.lower().endswith(('.zip', '.rar', '.tar', '.tar.gz')): - 
## this is archive, so we need to extract it and find pii from it, and return matched_strings - matched_strings = find_pii_in_archive(file_path, source) - is_archive = True - else: - # For other file types, read content normally - with open(file_path, 'rb') as file: - # Attempt to decode using UTF-8, fallback to 'latin-1' if needed - content = file.read().decode('utf-8', errors='replace') + matched_strings = scan_file(file_path, args, source) except Exception as e: - print_debug(f"Error in read_match_strings: {e}") - pass - - if not is_archive: - matched_strings = match_strings(content, source) - + print_debug(args, f"Error in read_match_strings: {e}") + matched_strings = [] return matched_strings def read_pdf(file_path): @@ -317,7 +321,7 @@ def read_pdf(file_path): # Handle decoding errors by trying a different encoding content += page.extract_text(encoding='latin-1') except Exception as e: - print_debug(f"Error in read_pdf: {e}") + print_debug(args, f"Error in read_pdf: {e}") return content @@ -343,7 +347,7 @@ def read_office_document(file_path): # You can add specific logic for PowerPoint if needed pass except Exception as e: - print_debug(f"Error in read_office_document: {e}") + print_debug(args, f"Error in read_office_document: {e}") return content def find_pii_in_archive(file_path, source): @@ -365,7 +369,7 @@ def find_pii_in_archive(file_path, source): for root, dirs, files in os.walk(tmp_dir): for file in files: file_path = os.path.join(root, file) - data = read_match_strings(file_path, source) + data = read_match_strings(args, file_path, source) for d in data: content.append(d) # Clean up the temporary directory @@ -399,7 +403,10 @@ def getFileData(file_path): return json.dumps({"error": str(e)}) -def SlackNotify(msg): +def SlackNotify(msg, args): + connections = get_connection(args) + if not args.no_write: + db = TinyDB('previous_alerts.json') if 'notify' in connections: notify_config = connections['notify'] # Check if suppress_duplicates is set to True 
@@ -412,7 +419,7 @@ def SlackNotify(msg): # Check if the message hash already exists in the previous alerts database Alert = Query() if db.contains(Alert.msg_hash == msg_hash): - print_debug("Duplicate message detected. Skipping webhook trigger.") + print_info(args, "Duplicate message detected. Skipping webhook trigger.") return slack_config = notify_config.get('slack', {}) @@ -429,7 +436,7 @@ def SlackNotify(msg): # Store the message hash in the previous alerts database db.insert({'msg_hash': msg_hash}) except Exception as e: - print_error(f"An error occurred: {str(e)}") + print_error(args, f"An error occurred: {str(e)}") def enhance_and_ocr(image_path): # Load the image diff --git a/hawk_scanner/main.py b/hawk_scanner/main.py index acf6fc5..ea040ae 100644 --- a/hawk_scanner/main.py +++ b/hawk_scanner/main.py @@ -16,10 +16,8 @@ def clear_screen(): clear_screen() -system.print_banner() console = Console() -args = system.args def load_command_module(command): try: module = importlib.import_module(f"hawk_scanner.commands.{command}") @@ -39,6 +37,8 @@ def execute_command(command, args): def main(): + args = system.parse_args() + system.print_banner(args) results = [] if args.command: if args.command == 'all': @@ -49,8 +49,8 @@ def main(): else: for data in execute_command(args.command, args): results.append(data) - else: - system.print_error("Please provide a command to execute") + else: + system.print_error(args, "Please provide a command to execute") ## GROUP results in grouped_results by datasource by key val grouped_results = {} @@ -71,7 +71,7 @@ def main(): file.write(json.dumps(grouped_results, indent=4)) else: print(json.dumps(grouped_results, indent=4)) - system.print_success(f"Results saved to {args.json}") + system.print_success(args, f"Results saved to {args.json}") sys.exit(0) if args.stdout: @@ -141,7 +141,7 @@ def main(): exposed_values=records_mini ) - system.SlackNotify(AlertMsg) + system.SlackNotify(AlertMsg, args) elif group == 'mysql': 
table.add_row( @@ -176,7 +176,7 @@ def main(): exposed_values=records_mini ) - system.SlackNotify(AlertMsg) + system.SlackNotify(AlertMsg, args) elif group == 'mongodb': table.add_row( @@ -211,7 +211,7 @@ def main(): exposed_values=records_mini ) - system.SlackNotify(AlertMsg) + system.SlackNotify(AlertMsg, args) elif group == 'slack': table.add_row( str(i), @@ -239,7 +239,7 @@ def main(): exposed_values=records_mini ) - system.SlackNotify(AlertMsg) + system.SlackNotify(AlertMsg, args) elif group == 'postgresql': table.add_row( str(i), @@ -273,7 +273,7 @@ def main(): exposed_values=records_mini ) - system.SlackNotify(AlertMsg) + system.SlackNotify(AlertMsg, args) elif group == 'redis': table.add_row( @@ -302,7 +302,7 @@ def main(): exposed_values=records_mini ) - system.SlackNotify(AlertMsg) + system.SlackNotify(AlertMsg, args) elif group == 'firebase' or group == 'gcs': table.add_row( str(i), @@ -332,7 +332,7 @@ def main(): exposed_values=records_mini ) - system.SlackNotify(AlertMsg) + system.SlackNotify(AlertMsg, args) elif group == 'fs': table.add_row( @@ -364,7 +364,7 @@ def main(): total_exposed=str(len(result['matches'])), exposed_values=records_mini ) - system.SlackNotify(AlertMsg) + system.SlackNotify(AlertMsg, args) elif group == 'couchdb': table.add_row( str(i), @@ -396,7 +396,7 @@ def main(): exposed_values=records_mini ) - system.SlackNotify(AlertMsg) + system.SlackNotify(AlertMsg, args) elif group == 'gdrive': table.add_row( str(i), @@ -422,7 +422,7 @@ def main(): exposed_values=records_mini ) - system.SlackNotify(AlertMsg) + system.SlackNotify(AlertMsg, args) elif group == 'gdrive_workspace': table.add_row( str(i), @@ -451,7 +451,7 @@ def main(): exposed_values=records_mini ) - system.SlackNotify(AlertMsg) + system.SlackNotify(AlertMsg, args) elif group == 'text': table.add_row( str(i), @@ -474,7 +474,7 @@ def main(): exposed_values=records_mini ) - system.SlackNotify(AlertMsg) + system.SlackNotify(AlertMsg, args) else: # Handle other cases or do 
nothing for unsupported groups pass diff --git a/readme.md index a308964..3c5cf3d 100644 --- a/readme.md +++ b/readme.md @@ -40,16 +40,34 @@ See how this works on Youtube - https://youtu.be/LuPXE7UJKOY ``` -### Example working command (Use all/fs/s3/gcs etc...) +### Examples +1. Example working command (Use all/fs/s3/gcs etc...) ```bash hawk_scanner all --connection connection.yml --fingerprint fingerprint.yml --json output.json --debug ``` - -### Pass connection data as CLI input in --connection-json flag, and output in json data (Helpful for CI/CD pipeline or automation) +2. Pass connection data as CLI input in --connection-json flag, and output in json data (Helpful for CI/CD pipeline or automation) ```bash hawk_scanner fs --connection-json '{"sources": {"fs": {"fs1": {"quick_scan": true, "path": "/Users/rohitcoder/Downloads/data/KYC_PDF.pdf"}}}}' --stdout --quiet --fingerprint fingerprint.yml ``` +3. You can also import Hawk-eye in your own python scripts and workflows, for better flexibility + ```python + from hawk_scanner.internals import system + pii = system.scan_file("/Users/kumarohit/Downloads/Resume.pdf") + print(pii) + ``` + +4. You can also import Hawk-eye with custom fingerprints in your own python scripts like this: +```python + from hawk_scanner.internals import system + pii = system.scan_file("/Users/kumarohit/Downloads/Resume.pdf", { + "fingerprint": { + "Email": '[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,}', + } + }) + print(pii) + ``` + ## Platform and arch-specific guidelines ### Postgresql diff --git a/setup.py b/setup.py index 4cbe81a..0bd0ecf 100644 --- a/setup.py +++ b/setup.py @@ -1,4 +1,4 @@ -VERSION = "0.3.14" +VERSION = "0.3.15" from setuptools import setup, find_packages