From aa8d2fe6c0082777c8b4a1bdb73dad440fd44ee6 Mon Sep 17 00:00:00 2001 From: I748376 Date: Fri, 30 Aug 2024 12:26:45 +0000 Subject: [PATCH] [IMP] cleans up code and removes all unused or unimportant files --- prospector/evaluation/analyse.py | 8 +- prospector/evaluation/analyse_statistics.py | 10 -- prospector/evaluation/compare.py | 2 +- prospector/evaluation/compare_reports.py | 133 ---------------- prospector/evaluation/create_jobs.py | 160 -------------------- prospector/evaluation/dispatch_jobs.py | 2 +- prospector/evaluation/extract_errors.py | 26 ---- prospector/rules/rules.py | 8 +- 8 files changed, 9 insertions(+), 340 deletions(-) delete mode 100644 prospector/evaluation/compare_reports.py delete mode 100644 prospector/evaluation/create_jobs.py delete mode 100644 prospector/evaluation/extract_errors.py diff --git a/prospector/evaluation/analyse.py b/prospector/evaluation/analyse.py index c1db5590e..546273e25 100644 --- a/prospector/evaluation/analyse.py +++ b/prospector/evaluation/analyse.py @@ -80,7 +80,7 @@ def analyse_prospector_reports(filename: str, selected_cves: str): # Keep track of the CVEs where there is no report file reports_not_found = [] - #### Data to insert into table + # Data to insert into table if BATCH in ["regular", "old_code"]: results = { "high": [], @@ -620,7 +620,7 @@ def generate_checkmarks_table(input_dataset: str, selected_cves): rule_checks = {rule: "" for rule in all_rules} for r in matched_rules: - rule_checks[r] = "\checkmark" + rule_checks[r] = "\checkmark" # noqa: W605 row.extend([rule_checks[r] for r in all_rules]) row.extend([str(overall_exectime), str(llm_exectime)]) @@ -785,9 +785,7 @@ def generate_sankey_diagram(file1: str, file2: str, file3: str): height=800, ) - output_file = ( - ANALYSIS_RESULTS_PATH + f"sankey-{file1}-{file2}-{file3}.png" - ) + output_file = ANALYSIS_RESULTS_PATH + f"sankey-{file1}-{file2}-{file3}.png" # Save as PNG write_image(fig, output_file) print(f"Sankey diagram saved to {output_file}") diff --git a/prospector/evaluation/analyse_statistics.py b/prospector/evaluation/analyse_statistics.py index ab8181c4f..edfc566cd 100644 --- a/prospector/evaluation/analyse_statistics.py +++ b/prospector/evaluation/analyse_statistics.py @@ -63,16 +63,6 @@ def analyse_statistics(filename: str): # noqa: C901 avg_cc_time = sum(cc_times) / len(cc_times) avg_total_cc_time = sum(total_cc_times) / len(total_cc_times) - # How many commits was the commit classification rule applied to? - for itm in dataset: - filepath = PROSPECTOR_REPORTS_PATH_HOST + filename + f"/{itm[0]}.json" - try: - cc_num_commits = _get_cc_num_commits(filepath) - break - - except FileNotFoundError: - continue - execution_data = { "timestamp": datetime.now().strftime("%H:%M:%S"), "total_files_found": len(repo_times), diff --git a/prospector/evaluation/compare.py b/prospector/evaluation/compare.py index 76da0a928..c87f11c31 100644 --- a/prospector/evaluation/compare.py +++ b/prospector/evaluation/compare.py @@ -91,7 +91,7 @@ def main(): file = "evaluation/data/input/d63.csv" dataset = load_dataset(file) - ## Things to measure + # Things to measure counterpart_exists = [] missing_in_directory1 = [] missing_in_directory2 = [] diff --git a/prospector/evaluation/compare_reports.py b/prospector/evaluation/compare_reports.py deleted file mode 100644 index c27a0e196..000000000 --- a/prospector/evaluation/compare_reports.py +++ /dev/null @@ -1,133 +0,0 @@ -# This script compares the reports of the same CVEs for two different batches -# of reports. It uses the flow-analysis.json file generated by the analysis.py -# file to have a list of CVEs that are classified differently in both batches. - - -from collections import Counter -from evaluation.utils import ( - ANALYSIS_RESULTS_PATH, - logger, - config, - load_json_file, -) - - -def process_cve(cve, from_category, to_category): - is_diff_order = False - is_same_rules = False - # Find Matteo's code report and my report - try: - matteo_report = load_json_file( - f"../../../data/prospector_reports/reports_now_with_matteos_code/{cve}.json" - ) - my_report = load_json_file( - f"../../../data/prospector_reports/reports_without_llm_mvi/{cve}.json" - ) - - except Exception as e: - # print(f"Couldn't open a report: {e}") - pass - - # Get lists of the candidates - matteo_candidate_list = [ - commit["commit_id"] for commit in matteo_report["commits"] - ] - - my_candidate_list = [commit["commit_id"] for commit in my_report["commits"]] - - if _same_elements(matteo_candidate_list, my_candidate_list): - print(f"Processing: {cve}, from {from_category} to {to_category}") - print(f"Same ranked candidates for {cve}") - # Are they also ordered the same? - if matteo_candidate_list != my_candidate_list: - print(f"Same candidates, but ranked differently!") - - is_diff_order = True - print("---") - - # They are not the same candidates, the reports found different candidates - else: - # Do the first 10 candidates match the same rules? - matteo_relevance_scores = [ - sum([rule["relevance"] for rule in commit["matched_rules"]]) - for commit in matteo_report["commits"][:10] - ] - my_relevance_scores = [ - sum([rule["relevance"] for rule in commit["matched_rules"]]) - for commit in my_report["commits"][:10] - ] - if matteo_relevance_scores == my_relevance_scores: - print(f"Processing: {cve}, from {from_category} to {to_category}") - print( - f"First ten candidates have equal relevances for {cve}: {my_relevance_scores}" - ) - # print(f"Candidates Matteo: {matteo_candidate_list[:10]}") - is_same_rules = True - print("---") - # print(f"Candidates Me: {my_candidate_list[:10]}") - - else: - num_same, list_different = _count_same_elements( - matteo_candidate_list, my_candidate_list - ) - # print(f"{num_same} candidates are the same: {list_different}") - # print(f"{num_same} candidates are the same.") - - return is_diff_order, is_same_rules - - -def _same_elements(list1: list, list2: list): - set1 = set(list1) - set2 = set(list2) - - # Check if one set is a subset of the other - return set1.issubset(set2) or set2.issubset(set1) - - -def _count_same_elements(list1, list2): - num_different = len(set(list1) & set(list2)) - - min_length = min(len(list1), len(list2)) - - result = [] - for i in range(min_length): - if list1[i] == list2[i]: - result.append("S") - else: - result.append("D") - - return num_different, result - - -def main(): - # Get all the different CVEs from the flow analysis - flow_analysis_data = load_json_file( - "evaluation/data/results/summary_execution/flow-analysis.json" - ) - different_candidate_order = [] - different_candidates_matching_same_rules = [] - # Iterate through these CVEs - for outer_key, outer_value in flow_analysis_data.items(): - for inner_key, cve_list in outer_value.items(): - for cve in cve_list: - try: - is_diff_order, is_same_rules = process_cve( - cve, outer_key, inner_key - ) - if is_diff_order: - different_candidate_order.append(cve) - if is_same_rules: - different_candidates_matching_same_rules.append(cve) - except: - continue - - print( - f"Same candidates, but differently ordered: {different_candidate_order}" - ) - print( - f"Different candidates, but equivalent relevance score in first 10 candidates: {different_candidate_order}" - ) - - -if __name__ == "__main__": - main() diff --git a/prospector/evaluation/create_jobs.py b/prospector/evaluation/create_jobs.py deleted file mode 100644 index e036133f2..000000000 --- a/prospector/evaluation/create_jobs.py +++ /dev/null @@ -1,160 +0,0 @@ -import json -import sys -import time -from datetime import datetime - -import redis -import requests -from rq import Connection, Queue, get_current_job - -from backenddb.postgres import PostgresBackendDB -from core.prospector import prospector -from core.report import generate_report -from llm.llm_service import LLMService -from log.logger import logger -from util.config_parser import parse_config_file - -from evaluation.utils import ( - PROSPECTOR_REPORTS_PATH_CONTAINER, - logger, - config, -) - -prospector_config = config.prospector_settings - - -async def enqueue_jobs(): - db = connect_to_db() - processed_vulns = db.get_processed_vulns_not_in_job() - print(processed_vulns) - created_by = "Auto" - for processed_vuln in processed_vulns: - pv_id = processed_vuln["_id"] - pv_repository = processed_vuln["repository"] - pv_versions = processed_vuln["versions"] - v_vuln_id = processed_vuln["vuln_id"] - - try: - job = _create_prospector_job(v_vuln_id, pv_repository, pv_versions) - except Exception: - logger.error( - "error while creating automatically the jobs", exc_info=True - ) - - try: - db.save_job( - job.get_id(), - pv_id, - job.args, - job.created_at, - job.started_at, - job.ended_at, - job.result, - created_by, - job.get_status(refresh=True), - ) - except Exception: - logger.error( - "error while saving automatically the jobs", exc_info=True - ) - - db.disconnect() - - -def _create_prospector_job(vuln_id, repo, version, at_front=False): - with Connection(redis.from_url(prospector_config.redis_url)): - queue = Queue(default_timeout=800) - if at_front: - job = queue.enqueue( - _run_prospector_and_generate_report, - args=(vuln_id, repo, version), - at_front=True, - ) - else: - job = queue.enqueue( - _run_prospector_and_generate_report, - args=(vuln_id, repo, version), - ) - - return job - - -def _run_prospector_and_generate_report(vuln_id, repo_url, v_int): - job = get_current_job() - job_id = job.get_id() - url = f"{prospector_config.backend}/jobs/{job_id}" - data = { - "status": job.get_status(), - "started_at": job.started_at.isoformat(), - } - - try: - response = requests.put(url, json=data) - if response.status_code == 200: - response_object = response.json() - print(response_object) - else: - print("Error:", response.status_code) - except requests.exceptions.RequestException as e: - print("Error:", e) - - params = { - "vulnerability_id": vuln_id, - "repository_url": repo_url, - "version_interval": v_int, - "use_backend": True, - "backend_address": prospector_config.backend, - "git_cache": "/tmp/gitcache", - "limit_candidates": 2000, - "use_llm_repository_url": False, - "enabled_rules": prospector_config.enabled_rules, - } - - try: - LLMService(prospector_config.llm_service) - except Exception as e: - logger.error(f"LLM Service could not be instantiated: {e}") - raise e - - try: - results, advisory_record = prospector(**params) - generate_report( - results, - advisory_record, - "json", - f"{PROSPECTOR_REPORTS_PATH_CONTAINER}{vuln_id}.json", - prospector_params=params, - ) - status = "finished" - results = f"data_sources/reports/{vuln_id}_{job_id}" - except Exception as e: - status = "failed" - results = None - logger.error(f"job failed during execution: {e}") - finally: - end_time = datetime.now().strftime("%Y-%m-%dT%H:%M:%S.%f") - print(job_id, status, end_time, results) - data = {"status": status, "finished_at": end_time, "results": results} - try: - response = requests.put(url, json=data) - if response.status_code == 200: - response_object = response.json() - print(response_object) - else: - print("Error:", response.status_code) - except requests.exceptions.RequestException as e: - print("Error:", e) - - return f"data_sources/reports/{vuln_id}_{job_id}" - - -def connect_to_db(): - db = PostgresBackendDB( - prospector_config.database.user, - prospector_config.database.password, - prospector_config.database.host, - prospector_config.database.port, - prospector_config.database.dbname, - ) - db.connect() - return db diff --git a/prospector/evaluation/dispatch_jobs.py b/prospector/evaluation/dispatch_jobs.py index 6186c4e8e..239cfdd00 100644 --- a/prospector/evaluation/dispatch_jobs.py +++ b/prospector/evaluation/dispatch_jobs.py @@ -101,7 +101,7 @@ def _run_prospector_and_generate_report( logger.error(f"prospector() crashed at {cve_id}: {e}") raise e - logger.info(f"prospector() returned. Generating report now.") + logger.info("prospector() returned. Generating report now.") try: generate_report( diff --git a/prospector/evaluation/extract_errors.py b/prospector/evaluation/extract_errors.py deleted file mode 100644 index 545ab0e41..000000000 --- a/prospector/evaluation/extract_errors.py +++ /dev/null @@ -1,26 +0,0 @@ -import re - -from evaluation.utils import ( - INPUT_DATA_PATH, - ANALYSIS_RESULTS_PATH, - load_dataset, -) - - -def extract_crash_lines(log_file_path, output_file_path): - crash_pattern = re.compile(r".*prospector\(\) crashed at.*") - - with open(log_file_path, "r") as log_file, open( - output_file_path, "a" - ) as output_file: - for line in log_file: - if crash_pattern.match(line): - output_file.write(line) - - -# Usage -log_file_path = f"evaluation.log" -output_file_path = f"{ANALYSIS_RESULTS_PATH}error_lines.log" - -extract_crash_lines(log_file_path, output_file_path) -print(f"Error lines have been extracted to {output_file_path}") diff --git a/prospector/rules/rules.py b/prospector/rules/rules.py index aee22fbdb..cdc1d8d64 100644 --- a/prospector/rules/rules.py +++ b/prospector/rules/rules.py @@ -452,10 +452,10 @@ def apply( r.raise_for_status() commit_data = r.json()[0] - # is_security_relevant = commit_data.get("security_relevant") - # if is_security_relevant is not None: - # candidate.security_relevant = is_security_relevant - # return is_security_relevant + is_security_relevant = commit_data.get("security_relevant") + if is_security_relevant is not None: + candidate.security_relevant = is_security_relevant + return is_security_relevant candidate.security_relevant = LLMService().classify_commit( candidate.diff, candidate.repository, candidate.message