
Adds Commit Classification Rule #392

Closed
33 commits
643d8b8
adds more fine-grained request error raising
lauraschauer Jul 2, 2024
e4811cf
changes code structure now model gets instantiated and llm functions …
lauraschauer Jun 11, 2024
9565c0e
adds explanation of different LLM options in config.yaml in README
lauraschauer Jun 11, 2024
c3b0f9f
adds use_llm_commit_rule to configuration file
lauraschauer Jun 11, 2024
2e23d5b
adds LLM rule
lauraschauer Jun 11, 2024
c1a5cbf
adds code for LLM rules to be applied
lauraschauer Jun 11, 2024
8fd63cb
fixes rebase issues
lauraschauer Jun 14, 2024
a27c967
starts restructuring to implement multiple phases of rule application
lauraschauer Jun 14, 2024
9cef994
changes to inheriting classes for rules: Rule -> NLPRule, LLMRule
lauraschauer Jun 17, 2024
7d87e52
finished refactoring and implements classes for each phase. Each phas…
lauraschauer Jun 17, 2024
6b39ef3
skeleton method to apply commit classification rule
lauraschauer Jun 17, 2024
a546b08
sends requests to commit classification service for the top 3 commits
lauraschauer Jun 17, 2024
70c4047
restructures rules testing files
lauraschauer Jun 18, 2024
71e1cd1
pins version of google-cloud-aiplatform dependency to resolve Depreca…
lauraschauer Jun 18, 2024
b39a967
fixes tests
lauraschauer Jun 18, 2024
7e12ef4
refactoring to avoid inheritance and use composition instead
lauraschauer Jun 18, 2024
71ec6d5
marks working version
lauraschauer Jun 20, 2024
7ae8019
marks final version without --disable-rules implemented
lauraschauer Jun 21, 2024
1efc2c5
updates tests for rules - not all rules are covered
lauraschauer Jun 21, 2024
40045ac
instantiates LLMService only if use_llm_rules is true
lauraschauer Jun 21, 2024
30c895e
moves constants to rules file
lauraschauer Jun 21, 2024
70d9401
implements enabled_rules parameter in config.yaml
lauraschauer Jun 24, 2024
2ee7fe3
sets default enabled rules to phase 1 rules
lauraschauer Jun 24, 2024
5b5cf9b
previous commit also changes code so that a subset of rules can still…
lauraschauer Jun 18, 2024
6d0a04a
refactoring to avoid inheritance and use composition instead
lauraschauer Jun 19, 2024
4ecbc4f
marks working version
lauraschauer Jun 20, 2024
5406c98
slightly improves the apply_rules function
lauraschauer Jun 20, 2024
831cb42
removes unnecessary comments
lauraschauer Jun 20, 2024
8a5cf56
passes Phase 2 rule
lauraschauer Jun 24, 2024
5237781
sets default enabled rules to phase 1 rules
lauraschauer Jun 27, 2024
d2d0345
adds function and rule for commit classification
lauraschauer Jun 24, 2024
7fa396c
adds commit context (repository name and message) to the prompt for d…
lauraschauer Jun 25, 2024
f15a5ad
adds error handling for too long diffs
lauraschauer Jul 2, 2024
1 change: 1 addition & 0 deletions prospector/README.md
Original file line number Diff line number Diff line change
@@ -55,6 +55,7 @@ To quickly set up Prospector, follow these steps. This will run Prospector in it
By default, Prospector saves the results in an HTML file named *prospector-report.html*.
Open this file in a web browser to view what Prospector was able to find!


### 🤖 LLM Support

To use Prospector with LLM support, simply set the required parameters for API access to the LLM in *config.yaml*. These parameters vary depending on your choice of provider; follow what fits your needs (drop-downs below). If you do not want to use LLM support, keep the `llm_service` block in your *config.yaml* file commented out.
3 changes: 2 additions & 1 deletion prospector/cli/main.py
@@ -68,7 +68,7 @@ def main(argv): # noqa: C901
)
return

# Create the LLMService singleton for later use
# Create the LLMService Singleton for later use
try:
LLMService(config.llm_service)
except Exception as e:
@@ -104,6 +104,7 @@ def main(argv): # noqa: C901
limit_candidates=config.max_candidates,
# ignore_adv_refs=config.ignore_refs,
use_llm_repository_url=config.llm_service.use_llm_repository_url,
enabled_rules=config.enabled_rules,
)

if config.preprocess_only:
21 changes: 21 additions & 0 deletions prospector/config-sample.yaml
@@ -36,6 +36,27 @@ redis_url: redis://redis:6379/0

# use_llm_repository_url: False # whether to use LLM's to obtain the repository URL

enabled_rules:
# Phase 1 Rules
- VULN_ID_IN_MESSAGE
- XREF_BUG
- XREF_GH
- COMMIT_IN_REFERENCE
- VULN_ID_IN_LINKED_ISSUE
- CHANGES_RELEVANT_FILES
- CHANGES_RELEVANT_CODE
- RELEVANT_WORDS_IN_MESSAGE
- ADV_KEYWORDS_IN_FILES
- ADV_KEYWORDS_IN_MSG
- SEC_KEYWORDS_IN_MESSAGE
- SEC_KEYWORDS_IN_LINKED_GH
- SEC_KEYWORDS_IN_LINKED_BUG
- GITHUB_ISSUE_IN_MESSAGE
- BUG_IN_MESSAGE
- COMMIT_HAS_TWINS
# Phase 2 Rules (llm_service required!):
# - COMMIT_IS_SECURITY_RELEVANT

# Report file format: "html", "json", "console" or "all"
# and the file name
report:
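The `enabled_rules` list in the config above selects a subset of a fixed rule registry. As a rough sketch of how such a list is typically validated and applied, with a hypothetical stand-in registry rather than Prospector's actual `rules/rules.py`:

```python
# Illustrative sketch: filter a rule registry down to the enabled subset,
# preserving registry order. The rule ids mirror the config sample above;
# the registry structure itself is an assumption.

RULES_PHASE_1 = ["VULN_ID_IN_MESSAGE", "XREF_BUG", "COMMIT_IN_REFERENCE"]
RULES_PHASE_2 = ["COMMIT_IS_SECURITY_RELEVANT"]
ALL_RULES = RULES_PHASE_1 + RULES_PHASE_2


def select_rules(enabled):
    # Reject unknown ids early so config typos fail loudly.
    unknown = set(enabled) - set(ALL_RULES)
    if unknown:
        raise ValueError(f"Unknown rule ids: {sorted(unknown)}")
    # Keep registry order, not the order given in the config.
    return [r for r in ALL_RULES if r in enabled]
```

Keeping registry order (rather than config order) makes the applied-rule sequence deterministic regardless of how the YAML list is written.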
29 changes: 23 additions & 6 deletions prospector/core/prospector.py
@@ -13,14 +13,14 @@

from cli.console import ConsoleWriter, MessageStatus
from datamodel.advisory import AdvisoryRecord, build_advisory_record
from datamodel.commit import Commit, apply_ranking, make_from_raw_commit
from datamodel.commit import Commit, make_from_raw_commit
from filtering.filter import filter_commits
from git.git import Git
from git.raw_commit import RawCommit
from git.version_to_tag import get_possible_tags
from llm.llm_service import LLMService
from log.logger import get_level, logger, pretty_log
from rules.rules import apply_rules
from rules.rules import RULES_PHASE_1, apply_rules
from stats.execution import (
Counter,
ExecutionTimer,
@@ -66,7 +66,7 @@ def prospector( # noqa: C901
use_backend: str = USE_BACKEND_ALWAYS,
git_cache: str = "/tmp/git_cache",
limit_candidates: int = MAX_CANDIDATES,
rules: List[str] = ["ALL"],
enabled_rules: List[str] = [rule.id for rule in RULES_PHASE_1],
tag_commits: bool = True,
silent: bool = False,
use_llm_repository_url: bool = False,
@@ -231,7 +231,9 @@
else:
logger.warning("Preprocessed commits are not being sent to backend")

ranked_candidates = evaluate_commits(preprocessed_commits, advisory_record, rules)
ranked_candidates = evaluate_commits(
preprocessed_commits, advisory_record, enabled_rules
)

# ConsoleWriter.print("Commit ranking and aggregation...")
ranked_candidates = remove_twins(ranked_candidates)
@@ -267,11 +269,26 @@ def filter(commits: Dict[str, RawCommit]) -> Dict[str, RawCommit]:


def evaluate_commits(
commits: List[Commit], advisory: AdvisoryRecord, rules: List[str]
commits: List[Commit],
advisory: AdvisoryRecord,
enabled_rules: List[str],
) -> List[Commit]:
    """This function applies rule phases. Each phase is associated with a set of rules, for example:
    - Phase 1: NLP Rules
    - Phase 2: LLM Rules

    Args:
        commits: the list of candidate commits that rules should be applied to
        advisory: the object containing all information about the advisory
        enabled_rules: a (sub)set of rule ids to run
    Returns:
        a list of commits ranked according to their relevance score
    Raises:
        MissingMandatoryValue: if there is an error in the LLM configuration object
    """
with ExecutionTimer(core_statistics.sub_collection("candidates analysis")):
with ConsoleWriter("Candidate analysis") as _:
ranked_commits = apply_ranking(apply_rules(commits, advisory, rules=rules))
ranked_commits = apply_rules(commits, advisory, enabled_rules=enabled_rules)

return ranked_commits

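The phase-based evaluation described in the `evaluate_commits` docstring can be sketched in miniature. The rule registry and `Commit` shape below are illustrative assumptions, not Prospector's actual datamodel:

```python
# Minimal sketch of rule application and relevance-based ranking.
# Rules are (predicate, relevance) pairs keyed by id; only enabled
# rules are applied, and commits are ranked by total relevance.
from dataclasses import dataclass, field


@dataclass
class Commit:
    commit_id: str
    matched_rules: list = field(default_factory=list)

    @property
    def relevance(self):
        return sum(r["relevance"] for r in self.matched_rules)


def apply_rules(commits, rules, enabled_rules):
    for rule_id, (predicate, relevance) in rules.items():
        if rule_id not in enabled_rules:
            continue
        for c in commits:
            if predicate(c):
                c.matched_rules.append({"rule": rule_id, "relevance": relevance})
    # Rank by accumulated relevance, highest first.
    return sorted(commits, key=lambda c: c.relevance, reverse=True)


rules = {
    "VULN_ID_IN_MESSAGE": (lambda c: "CVE" in c.commit_id, 32),
}
ranked = apply_rules([Commit("CVE-fix"), Commit("chore")], rules, ["VULN_ID_IN_MESSAGE"])
```

The relevance weight (32) is an arbitrary example value; the real weights live with each rule's definition.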
1 change: 1 addition & 0 deletions prospector/datamodel/commit.py
@@ -52,6 +52,7 @@ def __eq__(self, other: "Commit") -> bool:
return self.relevance == other.relevance

def add_match(self, rule: Dict[str, Any]):
"""Adds rule to the commit's matched rules. Makes sure that the rule is added in order of relevance."""
for i, r in enumerate(self.matched_rules):
if rule["relevance"] == r["relevance"]:
self.matched_rules.insert(i, rule)
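The `add_match` method above keeps `matched_rules` ordered by relevance. A standalone sketch of that insertion logic, operating on a plain list of dicts rather than the real `Commit` class:

```python
# Standalone sketch mirroring add_match: insert a rule so the list stays
# sorted by descending relevance. The >= comparison (an assumption; the
# truncated diff only shows the equality branch) places ties after
# existing entries of the same relevance.
def add_match(matched_rules, rule):
    for i, r in enumerate(matched_rules):
        if rule["relevance"] >= r["relevance"]:
            matched_rules.insert(i, rule)
            return
    matched_rules.append(rule)


matches = []
add_match(matches, {"id": "A", "relevance": 8})
add_match(matches, {"id": "B", "relevance": 32})
add_match(matches, {"id": "C", "relevance": 8})
```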
Empty file.
31 changes: 31 additions & 0 deletions prospector/evaluation/analyse.py
@@ -0,0 +1,31 @@
import os
import json

directory = "data_sources/reports/"

# Now analyse the reports
for filename in os.listdir(directory):
    filepath = directory + filename
    with open(filepath, "r") as f:
        data = json.load(f)

    if not data:
        print("Error occurred, JSON file could not be found.")

    results = {
        "relevance": [],
        "no_llm_rule_match": [],
    }

    print(data["commits"][0])

    for commit in data["commits"]:
        results["relevance"].append(
            {
                commit["commit_id"]: sum(
                    [rule["relevance"] for rule in commit["matched_rules"]]
                )
            }
        )
        if commit["matched_rules"]:
            print(commit["matched_rules"][0]["relevance"])
39 changes: 39 additions & 0 deletions prospector/evaluation/data_interaction.py
@@ -0,0 +1,39 @@
import json
from datetime import datetime

from data_sources.nvd.filter_entries import (
    find_matching_entries_test,
    get_cve_by_id,
    get_cves,
)


FILEPATH_SINGLE_CVE = "evaluation/single_cve.json"
FILEPATH_MULTIPLE_CVES = "evaluation/multiple_cves.json"


def save_single_cve():
    with open(FILEPATH_SINGLE_CVE, "w") as f:
        data = get_cve_by_id("CVE-2020-1925")
        filtered_cves = find_matching_entries_test(data)
        json.dump(filtered_cves, f)
        print("Saved a single CVE.")


def save_multiple_cves():
    with open(FILEPATH_MULTIPLE_CVES, "w") as f:
        data = get_cves(10)
        filtered_cves = find_matching_entries_test(data)
        json.dump(filtered_cves, f)
        print("Saved multiple CVEs.")


def load_single_cve():
    with open(FILEPATH_SINGLE_CVE, "r") as f:
        json_data = json.load(f)
    return json_data


def load_multiple_cves():
    with open(FILEPATH_MULTIPLE_CVES, "r") as f:
        return json.load(f)
35 changes: 35 additions & 0 deletions prospector/evaluation/dispatch_jobs.py
@@ -0,0 +1,35 @@
import json
from datetime import datetime

from data_sources.nvd.job_creation import create_prospector_job
from evaluation.data_interaction import (
    load_multiple_cves,
    load_single_cve,
    save_multiple_cves,
    save_single_cve,
)
from util.report_analyzer import analyze_commit_relevance_results


# # Save CVE Data
# save_single_cve()
# # Load CVE Data
# cves = load_single_cve()

# save_multiple_cves()
cves = load_multiple_cves()

for cve in cves:
    print(cve["nvd_info"]["cve"]["id"])
    # print(cve)

# Send them to Prospector to run & save results to data_source/reports/<cve_id>
for cve in cves:
    res = create_prospector_job(
        repository_url=cve["repo_url"],
        cve_id=cve["nvd_info"]["cve"]["id"],
        report_type="json",
        version_interval=cve["version_interval"],
    )  # Creates .json files for each CVE in app/data_sources/reports
    # if res["job_data"]["job_status"]:
    #     reported_cves.append(cves["vulnerabilities"][0]["cve"]["id"])
1 change: 1 addition & 0 deletions prospector/evaluation/get_cve_results.json

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions prospector/evaluation/multiple_cves.json

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions prospector/evaluation/single_cve.json
@@ -0,0 +1 @@
[{"nvd_info": {"cve": {"id": "CVE-2020-1925", "sourceIdentifier": "[email protected]", "published": "2020-01-09T19:15:10.807", "lastModified": "2020-01-15T14:26:32.803", "vulnStatus": "Analyzed", "descriptions": [{"lang": "en", "value": "Apache Olingo versions 4.0.0 to 4.7.0 provide the AsyncRequestWrapperImpl class which reads a URL from the Location header, and then sends a GET or DELETE request to this URL. It may allow to implement a SSRF attack. If an attacker tricks a client to connect to a malicious server, the server can make the client call any URL including internal resources which are not directly accessible by the attacker."}, {"lang": "es", "value": "Apache Olingo versiones 4.0.0 hasta 4.7.0, proporcionan la clase AsyncRequestWrapperImpl que lee una URL del encabezado Location y entonces env\u00eda una petici\u00f3n GET o DELETE a esta URL. Puede permitir implementar un ataque de tipo SSRF. Si un atacante enga\u00f1a a un cliente para que conecte con un servidor malicioso, el servidor puede hacer que el cliente llame a cualquier URL, incluyendo los recursos internos que no son accesibles directamente por el atacante."}], "metrics": {"cvssMetricV31": [{"source": "[email protected]", "type": "Primary", "cvssData": {"version": "3.1", "vectorString": "CVSS:3.1/AV:N/AC:L/PR:N/UI:N/S:U/C:H/I:N/A:N", "attackVector": "NETWORK", "attackComplexity": "LOW", "privilegesRequired": "NONE", "userInteraction": "NONE", "scope": "UNCHANGED", "confidentialityImpact": "HIGH", "integrityImpact": "NONE", "availabilityImpact": "NONE", "baseScore": 7.5, "baseSeverity": "HIGH"}, "exploitabilityScore": 3.9, "impactScore": 3.6}], "cvssMetricV2": [{"source": "[email protected]", "type": "Primary", "cvssData": {"version": "2.0", "vectorString": "AV:N/AC:L/Au:N/C:P/I:N/A:N", "accessVector": "NETWORK", "accessComplexity": "LOW", "authentication": "NONE", "confidentialityImpact": "PARTIAL", "integrityImpact": "NONE", "availabilityImpact": "NONE", "baseScore": 5.0}, "baseSeverity": 
"MEDIUM", "exploitabilityScore": 10.0, "impactScore": 2.9, "acInsufInfo": false, "obtainAllPrivilege": false, "obtainUserPrivilege": false, "obtainOtherPrivilege": false, "userInteractionRequired": false}]}, "weaknesses": [{"source": "[email protected]", "type": "Primary", "description": [{"lang": "en", "value": "CWE-918"}]}], "configurations": [{"nodes": [{"operator": "OR", "negate": false, "cpeMatch": [{"vulnerable": true, "criteria": "cpe:2.3:a:apache:olingo:*:*:*:*:*:*:*:*", "versionStartIncluding": "4.0.0", "versionEndIncluding": "4.7.0", "matchCriteriaId": "3303BD0F-10CA-4290-BC41-8653279BA978"}]}]}], "references": [{"url": "https://mail-archives.apache.org/mod_mbox/olingo-user/202001.mbox/%3CCAGSZ4d6HwpF2woOrZJg_d0SkHytXJaCtAWXa3ZtBn33WG0YFvw%40mail.gmail.com%3E", "source": "[email protected]", "tags": ["Patch", "Third Party Advisory"]}]}}, "repo_url": "https://github.com/apache/olingo-odata4", "version_interval": "4.0.0:4.7.0"}]
47 changes: 46 additions & 1 deletion prospector/llm/llm_service.py
@@ -3,9 +3,12 @@
import validators
from langchain_core.language_models.llms import LLM
from langchain_core.output_parsers import StrOutputParser
from requests import HTTPError

from datamodel.commit import Commit
from llm.instantiation import create_model_instance
from llm.prompts import prompt_best_guess
from llm.prompts.classify_commit import zero_shot as cc_zero_shot
from llm.prompts.get_repository_url import prompt_best_guess
from log.logger import logger
from util.config_parser import LLMServiceConfig
from util.singleton import Singleton
@@ -74,3 +77,45 @@ def get_repository_url(self, advisory_description, advisory_references) -> str:
raise RuntimeError(f"Prompt-model chain could not be invoked: {e}")

return url

    def classify_commit(
        self, diff: str, repository_name: str, commit_message: str
    ) -> bool:
        """Ask an LLM whether a commit is security relevant or not. The response will be either True or False.

        Args:
            diff (str): the diff of the commit
            repository_name (str): the name of the commit's repository
            commit_message (str): the commit's message

        Returns:
            True if the commit is deemed security relevant, False if not.

        Raises:
            RuntimeError: if there is an error in the model invocation or the response was not valid.
        """
        try:
            chain = cc_zero_shot | self.model | StrOutputParser()

            is_relevant = chain.invoke(
                {
                    "diff": diff,
                    "repository_name": repository_name,
                    "commit_message": commit_message,
                }
            )
            logger.info(f"LLM returned is_relevant={is_relevant}")

        except HTTPError as e:
            # If the diff is too long, the API returns a 400 error; silently
            # ignore this commit by returning False.
            status_code = e.response.status_code
            if status_code == 400:
                return False
            raise RuntimeError(f"Prompt-model chain could not be invoked: {e}")
        except Exception as e:
            raise RuntimeError(f"Prompt-model chain could not be invoked: {e}")

        if is_relevant == "True":
            return True
        elif is_relevant == "False":
            return False
        else:
            raise RuntimeError(f"The model returned an invalid response: {is_relevant}")
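`classify_commit` accepts only the exact strings `"True"` or `"False"`, while models often echo the ```ANSWER:``` scaffold from the prompt. A hypothetical, more tolerant parser (`parse_answer` is not part of `LLMService`; it is a sketch of one possible hardening) might look like:

```python
# Hedged sketch: tolerant parsing of an LLM's ANSWER:True / ANSWER:False
# reply, stripping backticks, whitespace, and the ANSWER: prefix before
# comparing against the two accepted values.
def parse_answer(raw: str) -> bool:
    cleaned = raw.strip().strip("`")
    if cleaned.upper().startswith("ANSWER:"):
        cleaned = cleaned[len("ANSWER:"):]
    cleaned = cleaned.strip()
    if cleaned == "True":
        return True
    if cleaned == "False":
        return False
    raise ValueError(f"Invalid model response: {raw!r}")
```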
1 change: 1 addition & 0 deletions prospector/llm/models/gemini.py
@@ -60,6 +60,7 @@ def _call(

try:
response = requests.post(endpoint, headers=headers, json=data)
response.raise_for_status()
return self.parse(response.json())
except requests.exceptions.HTTPError as http_error:
logger.error(
1 change: 1 addition & 0 deletions prospector/llm/models/mistral.py
@@ -41,6 +41,7 @@ def _call(

try:
response = requests.post(endpoint, headers=headers, json=data)
response.raise_for_status()
return self.parse(response.json())
except requests.exceptions.HTTPError as http_error:
logger.error(
1 change: 1 addition & 0 deletions prospector/llm/models/openai.py
@@ -44,6 +44,7 @@ def _call(

try:
response = requests.post(endpoint, headers=headers, json=data)
response.raise_for_status()
return self.parse(response.json())
except requests.exceptions.HTTPError as http_error:
logger.error(
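The one-line addition repeated across the Gemini, Mistral, and OpenAI models, `response.raise_for_status()`, turns a 4xx/5xx status into an exception before the body is parsed. A minimal stand-in (mimicking, not using, the real `requests` objects) illustrates why the line matters:

```python
# Stand-in for requests' raise_for_status(): without the check, an error
# body would be passed straight to parsing. FakeResponse and HTTPError
# here are local fakes, not the requests library's classes.
class HTTPError(Exception):
    pass


class FakeResponse:
    def __init__(self, status_code, payload):
        self.status_code = status_code
        self._payload = payload

    def raise_for_status(self):
        if self.status_code >= 400:
            raise HTTPError(f"{self.status_code} error")

    def json(self):
        return self._payload


def call(response):
    response.raise_for_status()  # fail fast on error statuses
    return response.json()["content"]


ok = call(FakeResponse(200, {"content": "hello"}))
```

With the check in place, the surrounding `except requests.exceptions.HTTPError` handlers in the `_call` methods actually get a chance to run.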
16 changes: 16 additions & 0 deletions prospector/llm/prompts/classify_commit.py
@@ -0,0 +1,16 @@
from langchain.prompts import PromptTemplate

zero_shot = PromptTemplate.from_template(
"""Is the following commit security relevant or not?
Please provide the output as a boolean value: ```ANSWER:```
If it is security relevant just answer ```ANSWER:True``` otherwise answer ```ANSWER:False```.

To provide you with some context, the name of the repository is: {repository_name}, and the
commit message is: {commit_message}.

Finally, here is the diff of the commit:
{diff}\n


```ANSWER: ```\n"""
)
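The `PromptTemplate` above infers its input variables (`repository_name`, `commit_message`, `diff`) from the braces in the template string. Plain `str.format` shows the same substitution; the example values below are illustrative, with the repository name taken from the CVE used elsewhere in this PR:

```python
# Sketch of how the zero-shot template is filled in; str.format stands in
# for langchain's PromptTemplate variable substitution. The template text
# is abbreviated, and the diff content is a made-up example.
template = (
    "Is the following commit security relevant or not?\n"
    "Repository: {repository_name}\n"
    "Commit message: {commit_message}\n"
    "Diff:\n{diff}\n"
)
prompt = template.format(
    repository_name="apache/olingo-odata4",
    commit_message="validate Location header before follow-up request",
    diff="- unsafe call\n+ validated call",
)
```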
File renamed without changes.
1 change: 1 addition & 0 deletions prospector/requirements.in
@@ -3,6 +3,7 @@ beautifulsoup4
colorama
datasketch
fastapi
google-cloud-aiplatform==1.49.0
Jinja2
langchain
langchain_openai
2 changes: 1 addition & 1 deletion prospector/requirements.txt
@@ -39,7 +39,7 @@ frozenlist==1.4.1
fsspec==2024.6.0
google-api-core==2.19.0
google-auth==2.29.0
google-cloud-aiplatform==1.53.0
google-cloud-aiplatform==1.49.0
google-cloud-bigquery==3.24.0
google-cloud-core==2.4.1
google-cloud-resource-manager==1.12.3