From 9a16540c2a65092a8c8b9ded7678a32c1e651f0f Mon Sep 17 00:00:00 2001 From: psyray Date: Sat, 13 Jul 2024 18:40:44 +0200 Subject: [PATCH 1/3] fix(fetch_url): fix unwanted subdomain and rework method --- README.md | 1 + default_yaml_config.yaml | 1 + web/fixtures/default_scan_engines.yaml | 2 +- web/reNgine/common_func.py | 7 + web/reNgine/tasks.py | 195 +++++++++++------- .../templates/scanEngine/add_engine.html | 1 + 6 files changed, 128 insertions(+), 79 deletions(-) diff --git a/README.md b/README.md index 268bc91a7..28ce3bbff 100644 --- a/README.md +++ b/README.md @@ -256,6 +256,7 @@ fetch_url: { ], 'remove_duplicate_endpoints': true, 'duplicate_fields': ['content_length', 'page_title'], + 'follow_redirect': false, 'enable_http_crawl': true, 'gf_patterns': ['debug_logic', 'idor', 'interestingEXT', 'interestingparams', 'interestingsubs', 'lfi', 'rce', 'redirect', 'sqli', 'ssrf', 'ssti', 'xss'], 'ignore_file_extensions': ['png', 'jpg', 'jpeg', 'gif', 'mp4', 'mpeg', 'mp3'], diff --git a/default_yaml_config.yaml b/default_yaml_config.yaml index 54c3f17bb..058d7c61d 100644 --- a/default_yaml_config.yaml +++ b/default_yaml_config.yaml @@ -100,6 +100,7 @@ fetch_url: { 'uses_tools': ['gospider', 'hakrawler', 'waybackurls', 'katana', 'gau'], 'remove_duplicate_endpoints': true, 'duplicate_fields': ['content_length', 'page_title'], + 'follow_redirect': false, 'enable_http_crawl': true, 'gf_patterns': ['debug_logic', 'idor', 'interestingEXT', 'interestingparams', 'interestingsubs', 'lfi', 'rce', 'redirect', 'sqli', 'ssrf', 'ssti', 'xss'], 'ignore_file_extensions': ['png', 'jpg', 'jpeg', 'gif', 'mp4', 'mpeg', 'mp3'], diff --git a/web/fixtures/default_scan_engines.yaml b/web/fixtures/default_scan_engines.yaml index 27b95d1ce..8c3e65cf8 100644 --- a/web/fixtures/default_scan_engines.yaml +++ b/web/fixtures/default_scan_engines.yaml @@ -23,7 +23,7 @@ 5,\r\n 'threads': 30,\r\n 'wordlist_name': 'dicc'\r\n}\r\nfetch_url: {\r\n \ 'uses_tools': ['gospider', 'hakrawler', 'waybackurls', 'katana', 'gau'],\r\n \ 'remove_duplicate_endpoints': true,\r\n 'duplicate_fields': ['content_length', - 'page_title'],\r\n 'enable_http_crawl': true,\r\n 'gf_patterns': ['debug_logic', + 'page_title'],\r\n 'follow_redirect': false,\r\n 'enable_http_crawl': true,\r\n 'gf_patterns': ['debug_logic', 'idor', 'interestingEXT', 'interestingparams', 'interestingsubs', 'lfi', 'rce', 'redirect', 'sqli', 'ssrf', 'ssti', 'xss'],\r\n 'ignore_file_extensions': ['png', 'jpg', 'jpeg', 'gif', 'mp4', 'mpeg', 'mp3'],\r\n 'threads': 30\r\n}\r\nvulnerability_scan: {\r\n diff --git a/web/reNgine/common_func.py b/web/reNgine/common_func.py index 0a3150c22..9ba036c17 100644 --- a/web/reNgine/common_func.py +++ b/web/reNgine/common_func.py @@ -309,19 +309,26 @@ def get_http_urls( query = EndPoint.objects if domain: + logger.debug(f'Searching URLs by domain {domain}') query = query.filter(target_domain=domain) if scan: + logger.debug(f'Searching URLs by scan {scan}') query = query.filter(scan_history=scan) if subdomain_id: + subdomain = Subdomain.objects.filter(pk=subdomain_id).first() + logger.debug(f'Searching URLs by subdomain {subdomain}') query = query.filter(subdomain__id=subdomain_id) elif exclude_subdomains and domain: + logger.debug(f'Excluding subdomains') query = query.filter(http_url=domain.http_url) if get_only_default_urls: + logger.debug(f'Searching only for default URL') query = query.filter(is_default=True) # If is_uncrawled is True, select only endpoints that have not been crawled # yet (no status) if is_uncrawled: + 
logger.debug(f'Searching for uncrawled endpoints only') query = query.filter(http_status__isnull=True) # If a path is passed, select only endpoints that contains it diff --git a/web/reNgine/tasks.py b/web/reNgine/tasks.py index 24aa47d30..140bd0698 100644 --- a/web/reNgine/tasks.py +++ b/web/reNgine/tasks.py @@ -1771,11 +1771,12 @@ def fetch_url(self, urls=[], ctx={}, description=None): threads = config.get(THREADS) or self.yaml_configuration.get(THREADS, DEFAULT_THREADS) domain_request_headers = self.domain.request_headers if self.domain else None custom_header = config.get(CUSTOM_HEADER) or self.yaml_configuration.get(CUSTOM_HEADER) + follow_redirect = config.get(FOLLOW_REDIRECT, False) # Get follow redirect setting if domain_request_headers or custom_header: custom_header = domain_request_headers or custom_header exclude_subdomains = config.get(EXCLUDED_SUBDOMAINS, False) - # Get URLs to scan and save to input file + # Initialize the URLs if urls and is_iterable(urls): with open(input_path, 'w') as f: f.write('\n'.join(urls)) @@ -1788,16 +1789,15 @@ def fetch_url(self, urls=[], ctx={}, description=None): ctx=ctx ) - # Domain regex - host = self.domain.name if self.domain else urlparse(urls[0]).netloc - host_regex = f"\'https?://([a-z0-9]+[.])*{host}.*\'" + # Log initial URLs + logger.debug(f'Initial URLs: {urls}') - # Tools cmds + # Initialize command map for tools cmd_map = { 'gau': f'gau', 'hakrawler': 'hakrawler -subs -u', 'waybackurls': 'waybackurls', - 'gospider': f'gospider --js -d 2 --sitemap --robots -w -r', + 'gospider': f'gospider --js -d 2 --sitemap --robots -w -r -a', 'katana': f'katana -silent -jc -kf all -d 3 -fs rdn', } if proxy: @@ -1808,26 +1808,47 @@ def fetch_url(self, urls=[], ctx={}, description=None): if threads > 0: cmd_map['gau'] += f' --threads {threads}' cmd_map['gospider'] += f' -t {threads}' + cmd_map['hakrawler'] += f' -t {threads}' cmd_map['katana'] += f' -c {threads}' if custom_header: cmd_map['gospider'] += generate_header_param(custom_header, 'gospider') cmd_map['hakrawler'] += generate_header_param(custom_header, 'hakrawler') cmd_map['katana'] += generate_header_param(custom_header, 'common') - cat_input = f'cat {input_path}' - grep_output = f'grep -Eo {host_regex}' - cmd_map = { - tool: f'{cat_input} | {cmd} | {grep_output} > {self.results_dir}/urls_{tool}.txt' - for tool, cmd in cmd_map.items() - } - tasks = group( - run_command.si( - cmd, - shell=True, - scan_id=self.scan_id, - activity_id=self.activity_id) - for tool, cmd in cmd_map.items() - if tool in tools - ) + + # Add follow_redirect option to tools that support it + if follow_redirect is False: + cmd_map['gospider'] += f' --no-redirect' + cmd_map['hakrawler'] += f' -dr' + cmd_map['katana'] += f' -dr' + + tasks = [] + + # Iterate over each URL and generate commands for each tool + for url in urls: + parsed_url = urlparse(url) + base_domain = parsed_url.netloc.split(':')[0] # Remove port if present + host_regex = f"'https?://{re.escape(base_domain)}(:[0-9]+)?(/.*)?$'" + + # Log the generated regex for the current URL + logger.debug(f'Generated regex for domain {base_domain}: {host_regex}') + + cat_input = f'echo "{url}"' + + # Generate commands for each tool for the current URL + for tool in tools: # Only use tools specified in the config + if tool in cmd_map: + cmd = cmd_map[tool] + tool_cmd = f'{cat_input} | {cmd} | grep -Eo {host_regex} > {self.results_dir}/urls_{tool}_{base_domain}.txt' + tasks.append(run_command.si( + tool_cmd, + shell=True, + scan_id=self.scan_id, + 
activity_id=self.activity_id) + ) + logger.debug(f'Generated command for tool {tool}: {tool_cmd}') + + # Group the tasks + task_group = group(tasks) # Cleanup task sort_output = [ @@ -1852,41 +1873,51 @@ def fetch_url(self, urls=[], ctx={}, description=None): ) # Run all commands - task = chord(tasks)(cleanup) + task = chord(task_group)(cleanup) with allow_join_result(): task.get() # Store all the endpoints and run httpx - with open(self.output_path) as f: - discovered_urls = f.readlines() - self.notify(fields={'Discovered URLs': len(discovered_urls)}) - - # Some tools can have an URL in the format ] - or - , add them - # to the final URL list all_urls = [] - for url in discovered_urls: - url = url.strip() - urlpath = None - base_url = None - if '] ' in url: # found JS scraped endpoint e.g from gospider - split = tuple(url.split('] ')) - if not len(split) == 2: - logger.warning(f'URL format not recognized for "{url}". Skipping.') - continue - base_url, urlpath = split - urlpath = urlpath.lstrip('- ') - elif ' - ' in url: # found JS scraped endpoint e.g from gospider - base_url, urlpath = tuple(url.split(' - ')) - - if base_url and urlpath: - subdomain = urlparse(base_url) - url = f'{subdomain.scheme}://{subdomain.netloc}{self.url_filter}' - - if not validators.url(url): - logger.warning(f'Invalid URL "{url}". Skipping.') - - if url not in all_urls: - all_urls.append(url) + tool_mapping = {} # New dictionary to map URLs to tools + for tool in tools: + for url in urls: + parsed_url = urlparse(url) + base_domain = parsed_url.netloc.split(':')[0] # Remove port if present + tool_output_file = f'{self.results_dir}/urls_{tool}_{base_domain}.txt' + if os.path.exists(tool_output_file): + with open(tool_output_file, 'r') as f: + discovered_urls = f.readlines() + for url in discovered_urls: + url = url.strip() + urlpath = None + base_url = None + if '] ' in url: # found JS scraped endpoint e.g from gospider + split = tuple(url.split('] ')) + if not len(split) == 2: + logger.warning(f'URL format not recognized for "{url}". Skipping.') + continue + base_url, urlpath = split + urlpath = urlpath.lstrip('- ') + elif ' - ' in url: # found JS scraped endpoint e.g from gospider + base_url, urlpath = tuple(url.split(' - ')) + + if base_url and urlpath: + subdomain = urlparse(base_url) + url = f'{subdomain.scheme}://{subdomain.netloc}{urlpath}' + + if not validators.url(url): + logger.warning(f'Invalid URL "{url}". Skipping.') + continue + + if url not in tool_mapping: + tool_mapping[url] = set() + tool_mapping[url].add(tool) # Use a set to ensure uniqueness + + all_urls = list(tool_mapping.keys()) + for url, found_tools in tool_mapping.items(): + unique_tools = ', '.join(found_tools) + logger.info(f'URL {url} found by tools: {unique_tools}') # Filter out URLs if a path filter was passed if self.url_filter: @@ -1907,7 +1938,6 @@ def fetch_url(self, urls=[], ctx={}, description=None): duplicate_removal_fields=duplicate_removal_fields ) - #-------------------# # GF PATTERNS MATCH # #-------------------# @@ -1965,11 +1995,12 @@ def fetch_url(self, urls=[], ctx={}, description=None): earlier_pattern = endpoint.matched_gf_patterns pattern = f'{earlier_pattern},{gf_pattern}' if earlier_pattern else gf_pattern endpoint.matched_gf_patterns = pattern + # TODO Add tool that found the URL to the db (need to update db model) + # endpoint.found_by_tools = ','.join(tool_mapping.get(url, [])) # Save tools in the endpoint endpoint.save() return all_urls - def parse_curl_output(response): # TODO: Enrich from other cURL fields. 
CURL_REGEX_HTTP_STATUS = f'HTTP\/(?:(?:\d\.?)+)\s(\d+)\s(?:\w+)' @@ -1985,7 +2016,6 @@ def parse_curl_output(response): 'http_status': http_status, } - @app.task(name='vulnerability_scan', queue='main_scan_queue', bind=True, base=RengineTask) def vulnerability_scan(self, urls=[], ctx={}, description=None): """ @@ -4052,7 +4082,8 @@ def remove_duplicate_endpoints( domain_id, subdomain_id=None, filter_ids=[], - filter_status=[200, 301, 404], + # TODO Check if the status code could be set as parameters of the scan engine instead of hardcoded values + filter_status=[200, 301, 302, 303, 307, 404, 410], # Extended status codes duplicate_removal_fields=ENDPOINT_SCAN_DEFAULT_DUPLICATE_FIELDS ): """Remove duplicate endpoints. @@ -4071,6 +4102,8 @@ def remove_duplicate_endpoints( duplicate_removal_fields (list): List of Endpoint model fields to check for duplicates """ logger.info(f'Removing duplicate endpoints based on {duplicate_removal_fields}') + + # Filter endpoints based on scan history and domain endpoints = ( EndPoint.objects .filter(scan_history__id=scan_history_id) @@ -4085,29 +4118,35 @@ def remove_duplicate_endpoints( if filter_ids: endpoints = endpoints.filter(id__in=filter_ids) - for field_name in duplicate_removal_fields: - cl_query = ( - endpoints - .values_list(field_name) - .annotate(mc=Count(field_name)) - .order_by('-mc') - ) - for (field_value, count) in cl_query: - if count > DELETE_DUPLICATES_THRESHOLD: - eps_to_delete = ( - endpoints - .filter(**{field_name: field_value}) - .order_by('discovered_date') - .all()[1:] - ) - msg = f'Deleting {len(eps_to_delete)} endpoints [reason: same {field_name} {field_value}]' - for ep in eps_to_delete: - url = urlparse(ep.http_url) - if url.path in ['', '/', '/login']: # try do not delete the original page that other pages redirect to - continue - msg += f'\n\t {ep.http_url} [{ep.http_status}] [{field_name}={field_value}]' - ep.delete() - logger.warning(msg) + # Group by all duplicate removal fields combined + fields_combined = duplicate_removal_fields[:] + fields_combined.append('id') # Add ID to ensure unique identification + + cl_query = ( + endpoints + .values(*duplicate_removal_fields) + .annotate(mc=Count('id')) + .order_by('-mc') + ) + + for field_values in cl_query: + if field_values['mc'] > DELETE_DUPLICATES_THRESHOLD: + filter_criteria = {field: field_values[field] for field in duplicate_removal_fields} + eps_to_delete = ( + endpoints + .filter(**filter_criteria) + .order_by('discovered_date') + .all()[1:] + ) + msg = f'Deleting {len(eps_to_delete)} endpoints [reason: same {filter_criteria}]' + for ep in eps_to_delete: + url = urlparse(ep.http_url) + if url.path in ['', '/', '/login']: # Ensure not to delete the original page that other pages redirect to + continue + msg += f'\n\t {ep.http_url} [{ep.http_status}] {filter_criteria}' + ep.delete() + logger.warning(msg) + @app.task(name='run_command', bind=False, queue='run_command_queue') def run_command( diff --git a/web/scanEngine/templates/scanEngine/add_engine.html b/web/scanEngine/templates/scanEngine/add_engine.html index 46179a492..7107c33eb 100644 --- a/web/scanEngine/templates/scanEngine/add_engine.html +++ b/web/scanEngine/templates/scanEngine/add_engine.html @@ -148,6 +148,7 @@

 'content_length',
 'page_title'
 ],
+ 'follow_redirect': false,
 'enable_http_crawl': true,
 'gf_patterns': ['debug_logic', 'idor', 'interestingEXT', 'interestingparams', 'interestingsubs', 'lfi', 'rce', 'redirect', 'sqli', 'ssrf', 'ssti', 'xss'],
 'ignore_file_extensions': ['png', 'jpg', 'jpeg', 'gif', 'mp4', 'mpeg', 'mp3'],

From 60d23dbba0e7e800d3b238d2bf5802ea4d56f235 Mon Sep 17 00:00:00 2001
From: psyray
Date: Sat, 13 Jul 2024 18:41:11 +0200
Subject: [PATCH 2/3] fix(log): improve debug log while in debug env

---
 web/reNgine/settings.py | 31 +++++++++++++++++++++++--------
 1 file changed, 23 insertions(+), 8 deletions(-)

diff --git a/web/reNgine/settings.py b/web/reNgine/settings.py
index 3c30fd712..03c2b7e56 100644
--- a/web/reNgine/settings.py
+++ b/web/reNgine/settings.py
@@ -260,11 +260,6 @@
             'level': 'ERROR' if UI_DEBUG else 'CRITICAL',
             'propagate': True,
         },
-        '': {
-            'handlers': ['brief'],
-            'level': 'DEBUG' if UI_DEBUG else 'INFO',
-            'propagate': False
-        },
         'celery.app.trace': {
             'handlers': ['null'],
             'propagate': False,
@@ -286,12 +281,32 @@
             'level': 'INFO',
             'propagate': False
         },
-        'reNgine.tasks': {
+        'reNgine': {
             'handlers': ['task'],
             'level': 'DEBUG' if CELERY_DEBUG else 'INFO',
-            'propagate': False
-        }
+            'propagate': True # Allow log messages to propagate to root logger
+        },
+        'kombu.pidbox': {
+            'handlers': ['null'],
+            'propagate': False,
+        },
+        'celery.pool': {
+            'handlers': ['null'],
+            'propagate': False,
+        },
+        'celery.bootsteps': {
+            'handlers': ['null'],
+            'propagate': False,
+        },
+        'celery.utils.functional': {
+            'handlers': ['null'],
+            'propagate': False,
+        },
     },
+    'root': {
+        'handlers': ['console'],
+        'level': 'DEBUG' if CELERY_DEBUG else 'INFO',
+    }
 }

 # debug

From b26bedfd3f676895b7a1fba6925f8f7726edc99e Mon Sep 17 00:00:00 2001
From: Psyray
Date: Sat, 3 Aug 2024 17:50:57 +0200
Subject: [PATCH 3/3] Apply suggestions from code review

Co-authored-by: Anonymoussaurus <50231698+AnonymousWP@users.noreply.github.com>
---
 web/fixtures/default_scan_engines.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/web/fixtures/default_scan_engines.yaml b/web/fixtures/default_scan_engines.yaml
index 8c3e65cf8..f76ab3b49 100644
--- a/web/fixtures/default_scan_engines.yaml
+++ b/web/fixtures/default_scan_engines.yaml
@@ -23,7 +23,7 @@
 5,\r\n 'threads': 30,\r\n 'wordlist_name': 'dicc'\r\n}\r\nfetch_url: {\r\n
 \ 'uses_tools': ['gospider', 'hakrawler', 'waybackurls', 'katana', 'gau'],\r\n
 \ 'remove_duplicate_endpoints': true,\r\n 'duplicate_fields': ['content_length',
- 'page_title'],\r\n 'follow_redirect': false,\r\n 'enable_http_crawl': true,\r\n 'gf_patterns': ['debug_logic',
+ 'page_title'],\r\n 'follow_redirect': false,\r\n 'enable_http_crawl': true,\r\n 'gf_patterns': ['debug_logic',
 'idor', 'interestingEXT', 'interestingparams', 'interestingsubs', 'lfi', 'rce',
 'redirect', 'sqli', 'ssrf', 'ssti', 'xss'],\r\n 'ignore_file_extensions': ['png',
 'jpg', 'jpeg', 'gif', 'mp4', 'mpeg', 'mp3'],\r\n 'threads': 30\r\n}\r\nvulnerability_scan: {\r\n
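The two sketches below are annotations rather than part of the patch series; they illustrate the two behavioural changes patch 1 makes to fetch_url. First, the per-URL host filter: the helper name build_host_regex and the sample URLs are hypothetical, but the pattern shape (https?:// plus the escaped host, an optional port, and an optional path, anchored at the end) is the one the patch feeds to grep -Eo, and it is what keeps sibling subdomains out of each tool's output.

import re
from urllib.parse import urlparse

def build_host_regex(url):
    """Hypothetical helper mirroring the regex built per URL in fetch_url()."""
    # Strip a possible port, like parsed_url.netloc.split(':')[0] in the patch.
    base_domain = urlparse(url).netloc.split(':')[0]
    # Anchor the pattern to this exact host, optionally followed by a port and a path.
    return re.compile(rf"https?://{re.escape(base_domain)}(:[0-9]+)?(/.*)?$")

# Hypothetical crawler output: only URLs on the exact target host should survive.
candidates = [
    'https://app.example.com/login',          # kept: exact host
    'https://app.example.com:8443/api/v1',    # kept: same host, explicit port
    'https://dev.app.example.com/login',      # dropped: sibling subdomain
    'https://cdn.example.com/static/app.js',  # dropped: different subdomain
]
host_regex = build_host_regex('https://app.example.com')
print([url for url in candidates if host_regex.search(url)])
# ['https://app.example.com/login', 'https://app.example.com:8443/api/v1']

Second, the new follow_redirect option only toggles the redirect flags of the crawlers that support them (--no-redirect for gospider, -dr for hakrawler and katana, as in the diff). A minimal sketch of that mapping, using a hypothetical build_commands helper in place of the task's cmd_map handling:

def build_commands(follow_redirect=False, threads=30):
    """Hypothetical helper showing how fetch_url assembles per-tool commands."""
    cmd_map = {
        'gospider': f'gospider --js -d 2 --sitemap --robots -w -r -a -t {threads}',
        'hakrawler': f'hakrawler -subs -u -t {threads}',
        'katana': f'katana -silent -jc -kf all -d 3 -fs rdn -c {threads}',
    }
    if not follow_redirect:
        # Same flags the patch appends when 'follow_redirect' is false.
        cmd_map['gospider'] += ' --no-redirect'
        cmd_map['hakrawler'] += ' -dr'
        cmd_map['katana'] += ' -dr'
    return cmd_map

print(build_commands(follow_redirect=False)['katana'])
# katana -silent -jc -kf all -d 3 -fs rdn -c 30 -dr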