Skip to content

Commit

Permalink
Added import support and improved docs
Browse files (browse the repository at this point in the history)
  • Loading branch information
rohitcoder committed Jun 30, 2024
1 parent 0dfb307 commit 1302199
Show file tree
Hide file tree
Showing 17 changed files with 273 additions and 255 deletions.
22 changes: 11 additions & 11 deletions hawk_scanner/commands/couchdb.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -9,13 +9,13 @@ def connect_couchdb(host, port, username, password, database):
try:
server = couchdb.Server(f"http://{username}:{password}@{host}:{port}/")
if database not in server:
system.print_error(f"Database {database} not found on CouchDB server.")
system.print_error(args, f"Database {database} not found on CouchDB server.")
return None
db = server[database]
system.print_info(f"Connected to CouchDB database")
system.print_info(args, f"Connected to CouchDB database")
return db
except Exception as e:
system.print_error(f"Failed to connect to CouchDB database with error: {e}")
system.print_error(args, f"Failed to connect to CouchDB database with error: {e}")
return None

def check_data_patterns(db, patterns, profile_name, database_name):
Expand All @@ -25,7 +25,7 @@ def check_data_patterns(db, patterns, profile_name, database_name):
for field_name, field_value in document.items():
if field_value:
value_str = str(field_value)
matches = system.match_strings(value_str)
matches = system.match_strings(args, value_str)
if matches:
for match in matches:
results.append({
Expand All @@ -44,15 +44,15 @@ def check_data_patterns(db, patterns, profile_name, database_name):

def execute(args):
results = []
system.print_info(f"Running Checks for CouchDB Sources")
connections = system.get_connection()
system.print_info(args, f"Running Checks for CouchDB Sources")
connections = system.get_connection(args)

if 'sources' in connections:
sources_config = connections['sources']
couchdb_config = sources_config.get('couchdb')

if couchdb_config:
patterns = system.get_fingerprint_file()
patterns = system.get_fingerprint_file(args)

for key, config in couchdb_config.items():
host = config.get('host')
Expand All @@ -62,16 +62,16 @@ def execute(args):
database = config.get('database')

if host and username and password and database:
system.print_info(f"Checking CouchDB Profile {key} with host and authentication")
system.print_info(args, f"Checking CouchDB Profile {key} with host and authentication")
else:
system.print_error(f"Incomplete CouchDB configuration for key: {key}")
system.print_error(args, f"Incomplete CouchDB configuration for key: {key}")
continue

db = connect_couchdb(host, port, username, password, database)
if db:
results += check_data_patterns(db, patterns, key, database)
else:
system.print_error("No CouchDB connection details found in connection.yml")
system.print_error(args, "No CouchDB connection details found in connection.yml")
else:
system.print_error("No 'sources' section found in connection.yml")
system.print_error(args, "No 'sources' section found in connection.yml")
return results
24 changes: 12 additions & 12 deletions hawk_scanner/commands/firebase.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -10,15 +10,15 @@ def connect_firebase(credentials_file, bucket_name):
cred = credentials.Certificate(credentials_file)
firebase_admin.initialize_app(cred)
bucket = storage.bucket(bucket_name)
system.print_info(f"Connected to Firebase bucket: {bucket_name}")
system.print_info(args, f"Connected to Firebase bucket: {bucket_name}")
return bucket
except Exception as e:
print(f"Failed to connect to Firebase bucket: {e}")

def execute(args):
results = []
shouldDownload = True
connections = system.get_connection()
connections = system.get_connection(args)

if 'sources' in connections:
sources_config = connections['sources']
Expand All @@ -37,7 +37,7 @@ def execute(args):
file_name = blob.name
## get unique etag or hash of file
remote_etag = blob.etag
system.print_debug(f"Remote etag: {remote_etag}")
system.print_debug(args, f"Remote etag: {remote_etag}")

if system.should_exclude_file(file_name, exclude_patterns):
continue
Expand All @@ -49,20 +49,20 @@ def execute(args):
if os.path.exists(file_path):
shouldDownload = False
local_etag = file_path.split('/')[-1].split('-')[0]
system.print_debug(f"Local etag: {local_etag}")
system.print_debug(f"File already exists in cache, using it. You can disable cache by setting 'cache: false' in connection.yml")
system.print_debug(args, f"Local etag: {local_etag}")
system.print_debug(args, f"File already exists in cache, using it. You can disable cache by setting 'cache: false' in connection.yml")
if remote_etag != local_etag:
system.print_debug(f"File in firebase bucket has changed, downloading it again...")
system.print_debug(args, f"File in firebase bucket has changed, downloading it again...")
shouldDownload = True
else:
shouldDownload = False

if shouldDownload:
file_path = f"data/firebase/{remote_etag}-{file_name}"
system.print_debug(f"Downloading file: {file_name} to {file_path}...")
system.print_debug(args, f"Downloading file: {file_name} to {file_path}...")
blob.download_to_filename(file_path)

matches = system.read_match_strings(file_path, 'google_cloud_storage')
matches = system.read_match_strings(args, file_path, 'google_cloud_storage')
if matches:
for match in matches:
results.append({
Expand All @@ -76,13 +76,13 @@ def execute(args):
})

else:
system.print_error(f"Failed to connect to Firebase bucket: {bucket_name}")
system.print_error(args, f"Failed to connect to Firebase bucket: {bucket_name}")
else:
system.print_error(f"Incomplete Firebase configuration for key: {key}")
system.print_error(args, f"Incomplete Firebase configuration for key: {key}")
else:
system.print_error("No Firebase connection details found in connection file")
system.print_error(args, "No Firebase connection details found in connection file")
else:
system.print_error("No 'sources' section found in connection.yml")
system.print_error(args, "No 'sources' section found in connection.yml")

if config.get("cache") == False:
os.system("rm -rf data/firebase")
Expand Down
25 changes: 9 additions & 16 deletions hawk_scanner/commands/fs.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -6,8 +6,8 @@
import concurrent.futures
import time

def process_file(file_path, key, results):
matches = system.read_match_strings(file_path, 'fs')
def process_file(args, file_path, key, results):
matches = system.read_match_strings(args, file_path, 'fs')
file_data = system.getFileData(file_path)
if matches:
for match in matches:
Expand All @@ -24,18 +24,18 @@ def process_file(file_path, key, results):

def execute(args):
results = []
connections = system.get_connection()
connections = system.get_connection(args)
if 'sources' in connections:
sources_config = connections['sources']
fs_config = sources_config.get('fs')
if fs_config:
for key, config in fs_config.items():
if 'path' not in config:
system.print_error(f"Path not found in fs profile '{key}'")
system.print_error(args, f"Path not found in fs profile '{key}'")
continue
path = config.get('path')
if not os.path.exists(path):
system.print_error(f"Path '{path}' does not exist")
system.print_error(args, f"Path '{path}' does not exist")

exclude_patterns = fs_config.get(key, {}).get('exclude_patterns', [])
start_time = time.time()
Expand All @@ -51,21 +51,14 @@ def execute(args):
futures = []
for file_path in files:
file_count += 1
futures.append(executor.submit(process_file, file_path, key, results))
futures.append(executor.submit(process_file, args, file_path, key, results))

# Wait for all tasks to complete
concurrent.futures.wait(futures)
end_time = time.time()
system.print_info(f"Time taken to analyze {file_count} files: {end_time - start_time} seconds")
system.print_info(args, f"Time taken to analyze {file_count} files: {end_time - start_time} seconds")
else:
system.print_error("No filesystem 'fs' connection details found in connection.yml")
system.print_error(args, "No filesystem 'fs' connection details found in connection.yml")
else:
system.print_error("No 'sources' section found in connection.yml")
system.print_error(args, "No 'sources' section found in connection.yml")
return results

if __name__ == "__main__":
parser = argparse.ArgumentParser()
# Add your command-line arguments here if needed
args = parser.parse_args()
results = execute(args)
# Handle results as needed
24 changes: 12 additions & 12 deletions hawk_scanner/commands/gcs.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -13,7 +13,7 @@ def connect_google_cloud(bucket_name, credentials_file):
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = credentials_file
client = storage.Client()
bucket = client.get_bucket(bucket_name)
system.print_debug(f"Connected to Google Cloud Storage bucket: {bucket_name}")
system.print_debug(args, f"Connected to Google Cloud Storage bucket: {bucket_name}")
return bucket
except Exception as e:
print(f"Failed to connect to Google Cloud Storage bucket: {e}")
Expand All @@ -25,7 +25,7 @@ def get_last_update_time(blob):
def execute(args):
results = []
shouldDownload = True
connections = system.get_connection()
connections = system.get_connection(args)

if 'sources' in connections:
sources_config = connections['sources']
Expand All @@ -44,7 +44,7 @@ def execute(args):
file_name = blob.name
## get unique etag or hash of file
remote_etag = get_last_update_time(blob)
system.print_debug(f"Remote etag: {remote_etag}")
system.print_debug(args, f"Remote etag: {remote_etag}")

if system.should_exclude_file(file_name, exclude_patterns):
continue
Expand All @@ -56,19 +56,19 @@ def execute(args):
if os.path.exists(file_path):
shouldDownload = False
local_etag = file_path.split('/')[-1].split('-')[0]
system.print_debug(f"Local etag: {local_etag}")
system.print_debug(f"File already exists in cache, using it. You can disable cache by setting 'cache: false' in connection.yml")
system.print_debug(args, f"Local etag: {local_etag}")
system.print_debug(args, f"File already exists in cache, using it. You can disable cache by setting 'cache: false' in connection.yml")
if remote_etag != local_etag:
system.print_debug(f"File in Google Cloud Storage bucket has changed, downloading it again...")
system.print_debug(args, f"File in Google Cloud Storage bucket has changed, downloading it again...")
shouldDownload = True
else:
shouldDownload = False

if shouldDownload:
system.print_debug(f"Downloading file: {file_name} to {file_path}...")
system.print_debug(args, f"Downloading file: {file_name} to {file_path}...")
blob.download_to_filename(file_path)

matches = system.read_match_strings(file_path, 'google_cloud_storage')
matches = system.read_match_strings(args, file_path, 'google_cloud_storage')
if matches:
for match in matches:
results.append({
Expand All @@ -81,13 +81,13 @@ def execute(args):
'data_source': 'gcs'
})
else:
system.print_error(f"Failed to connect to Google Cloud Storage bucket: {bucket_name}")
system.print_error(args, f"Failed to connect to Google Cloud Storage bucket: {bucket_name}")
else:
system.print_error(f"Incomplete Google Cloud Storage configuration for key: {key}")
system.print_error(args, f"Incomplete Google Cloud Storage configuration for key: {key}")
else:
system.print_error("No Google Cloud Storage connection details found in connection.yml")
system.print_error(args, "No Google Cloud Storage connection details found in connection.yml")
else:
system.print_error("No 'sources' section found in connection.yml")
system.print_error(args, "No 'sources' section found in connection.yml")
if config.get("cache") == False:
os.system("rm -rf data/google_cloud_storage")
return results
16 changes: 8 additions & 8 deletions hawk_scanner/commands/gdrive.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -16,7 +16,7 @@ def connect_google_drive(credentials_file):

try:
fs = GDriveFileSystem("root", client_id=client_id, client_secret=client_secret, token=credentials_file)
system.print_debug("Connected to Google Drive")
system.print_debug(args, "Connected to Google Drive")
drive = fs.client
return drive
except Exception as e:
Expand Down Expand Up @@ -53,7 +53,7 @@ def download_file(drive, file_obj, base_path):
else:
file_obj.GetContentFile(file_path)

system.print_debug(f"File downloaded to: {file_path}")
system.print_debug(args, f"File downloaded to: {file_path}")
except Exception as e:
print(f"Failed to download file: {e}")

Expand All @@ -68,15 +68,15 @@ def list_files(drive, folder_name=None):
def execute(args):
results = []
should_download = True
connections = system.get_connection()
connections = system.get_connection(args)
is_cache_enabled = False
drive_config = None

if 'sources' in connections:
sources_config = connections['sources']
drive_config = sources_config.get('gdrive')
else:
system.print_error("No 'sources' section found in connection.yml")
system.print_error(args, "No 'sources' section found in connection.yml")

if drive_config:
for key, config in drive_config.items():
Expand Down Expand Up @@ -113,14 +113,14 @@ def execute(args):

if config.get("cache") and os.path.exists(file_path):
should_download = False
system.print_debug(f"File already exists in cache, using it.")
system.print_debug(args, f"File already exists in cache, using it.")
else:
should_download = True

if should_download:
download_file(drive, file_obj, "data/google_drive")

matches = system.read_match_strings(file_path, 'gdrive')
matches = system.read_match_strings(args, file_path, 'gdrive')
if matches:
for match in matches:
results.append({
Expand All @@ -134,9 +134,9 @@ def execute(args):
'data_source': 'gdrive'
})
else:
system.print_error("Failed to connect to Google Drive")
system.print_error(args, "Failed to connect to Google Drive")
else:
system.print_error("No Google Drive connection details found in connection file")
system.print_error(args, "No Google Drive connection details found in connection file")

if not is_cache_enabled:
os.system("rm -rf data/google_drive")
Expand Down
14 changes: 7 additions & 7 deletions hawk_scanner/commands/gdrive_workspace.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -64,7 +64,7 @@ def download_file(drive, file_obj, base_path):
except Exception as e:
print(f"Failed to write file: {e}")

system.print_debug(f"File downloaded to: {folder_path}")
system.print_debug(args, f"File downloaded to: {folder_path}")
except Exception as e:
print(f"Failed to download file: {e}")

Expand All @@ -83,14 +83,14 @@ def list_files(drive, impersonate_user=None):

def execute(args):
results = []
connections = system.get_connection()
connections = system.get_connection(args)
is_cache_enabled = False

if 'sources' in connections:
sources_config = connections['sources']
drive_config = sources_config.get('gdrive_workspace')
else:
system.print_error("No 'sources' section found in connection.yml")
system.print_error(args, "No 'sources' section found in connection.yml")

if drive_config:
for key, config in drive_config.items():
Expand Down Expand Up @@ -121,14 +121,14 @@ def execute(args):

if config.get("cache") and os.path.exists(file_path):
is_cache_enabled = False
system.print_debug(f"File already exists in cache, using it.")
system.print_debug(args, f"File already exists in cache, using it.")
else:
is_cache_enabled = True

if is_cache_enabled:
download_file(drive, file_obj, "data/google_drive/")

matches = system.read_match_strings(file_path, 'gdrive_workspace')
matches = system.read_match_strings(args, file_path, 'gdrive_workspace')
file_name = file_name.replace('-runtime.pdf', '')
if matches:
for match in matches:
Expand All @@ -144,9 +144,9 @@ def execute(args):
'data_source': 'gdrive_workspace'
})
else:
system.print_error("Failed to connect to Google Drive")
system.print_error(args, "Failed to connect to Google Drive")
else:
system.print_error("No Google Drive connection details found in connection file")
system.print_error(args, "No Google Drive connection details found in connection file")

"""if not is_cache_enabled:
os.system("rm -rf data/google_drive")"""
Expand Down
Loading

0 comments on commit 1302199

Please sign in to comment.