Refactored LLM Integration #388

Merged · 37 commits · Jul 4, 2024

Commits
df82247
adjusts tests
lauraschauer Jun 13, 2024
520e67d
basic refactoring mvp
lauraschauer Jun 12, 2024
da1d558
I can run the code, but the singleton pattern implemented with metacl…
lauraschauer Jun 13, 2024
6b2b982
singleton pattern works
lauraschauer Jun 13, 2024
3efb1a6
move models into their own files
lauraschauer Jun 13, 2024
f306298
updates tests
lauraschauer Jun 13, 2024
a7cf5fd
corrects information about missing commits to be logged as info inste…
lauraschauer Jun 14, 2024
1cb3673
moves singleton metaclass into utils
lauraschauer Jun 14, 2024
15057d8
adds tests
lauraschauer Jun 14, 2024
d55735d
changes variable name d to model_definition for clarity
lauraschauer Jun 14, 2024
0a6d1c0
moves ai core sk filepath to config.yaml
lauraschauer Jun 14, 2024
8d105db
adds StrOutputParser to model chain and prints returned URL to console
lauraschauer Jun 14, 2024
92a5401
streamlines error handling
lauraschauer Jun 14, 2024
5601ef0
updates README with explanation of llm_service parameters in config.yaml
lauraschauer Jun 17, 2024
95b3917
removes internal URL
lauraschauer Jun 17, 2024
beab86e
corrects mistake in singleton logger value
lauraschauer Jun 19, 2024
a672e7d
adds singleton tests
lauraschauer Jun 19, 2024
8e9efc5
refactors: LLMService is created in main, when there is access to the…
lauraschauer Jun 19, 2024
bb8701b
refactors: simplifies model_instantiation, gets rid of two layer inhe…
lauraschauer Jun 19, 2024
291935c
updates tests
lauraschauer Jun 19, 2024
1409b01
adds correct strings to return from _llm_type()
lauraschauer Jun 19, 2024
1216710
corrects docstring
lauraschauer Jun 19, 2024
72e0f30
renames file
lauraschauer Jun 19, 2024
8afa9c5
final touch-up
lauraschauer Jun 20, 2024
cc85b0b
fixes bug where a LLMService could not be created. Check logs. error …
lauraschauer Jun 20, 2024
e45a136
fixes tests
lauraschauer Jun 21, 2024
be27723
removes extra index urls
lauraschauer Jun 21, 2024
5a14377
more env variables mocking for tests
lauraschauer Jun 21, 2024
32ef6fb
small modification to prompt
lauraschauer Jun 21, 2024
1d215b5
removes pip options
lauraschauer Jun 21, 2024
f451ab4
adds more OpenAI models for third party providers
lauraschauer Jun 24, 2024
988c3dd
removes unused mock class
lauraschauer Jun 24, 2024
7b497f5
makes sure that --repository flag overrides config.yaml file
lauraschauer Jun 24, 2024
0f8dd3d
conceptional change: when llm_service is set in config.yaml, prospect…
lauraschauer Jun 24, 2024
53638d3
adds more fine-grained request error raising
lauraschauer Jul 2, 2024
38060fb
adds a check using regex to remove delimiters if they are returned by…
lauraschauer Jul 4, 2024
485cf83
updates tests to take new version tags into account
lauraschauer Jul 4, 2024
29 changes: 23 additions & 6 deletions prospector/README.md
@@ -57,47 +57,64 @@ To quickly set up Prospector, follow these steps. This will run Prospector in it

### 🤖 LLM Support

To use Prospector with LLM support, set the `use_llm_<...>` parameters in `config.yaml`. Additionally, you must specify required parameters for API access to the LLM. These parameters can vary depending on your choice of provider, please follow what fits your needs:
To use Prospector with LLM support, set the required parameters for API access to the LLM in *config.yaml*. These parameters vary depending on your choice of provider; follow the instructions that fit your needs (drop-downs below). If you do not want to use LLM support, keep the `llm_service` block in your *config.yaml* file commented out.

<details><summary><b>Use SAP AI CORE SDK</b></summary>

You will need the following parameters in `config.yaml`:
You will need the following parameters in *config.yaml*:

```yaml
llm_service:
type: sap
model_name: <model_name>
temperature: 0.0
ai_core_sk: <file_path>
```

`<model_name>` refers to the model names available in the Generative AI Hub in SAP AI Core. [Here](https://github.tools.sap/I343697/generative-ai-hub-readme#1-supported-models) you can find an overview of available models.
`<model_name>` refers to the model names available in the Generative AI Hub in SAP AI Core. You can find an overview of available models on the Generative AI Hub GitHub page.

In `.env`, you must set the deployment URL as an environment variable following this naming convention:
```yaml
<model_name (in capitals, and - changed to _)>_URL
<model_name>_URL # model name in capitals, and "-" changed to "_"
```
For example, for gpt-4's deployment URL, set an environment variable called `GPT_4_URL`.
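As a concrete sketch, a *.env* entry for a `gpt-4` deployment might look like this (host and deployment ID are placeholders, not real values):

```
# Hypothetical deployment URL -- replace host and deployment ID with your own
GPT_4_URL=https://<ai-core-host>/v2/inference/deployments/<deployment-id>
```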

The `temperature` parameter is optional. The default value is 0.0, but you can change it to something else.

You also need to point the `ai_core_sk` parameter to a file containing the secret keys.

</details>

<details><summary><b>Use personal third party provider</b></summary>

Implemented third party providers are **OpenAI**, **Google** and **Mistral**.

1. You will need the following parameters in `config.yaml`:
1. You will need the following parameters in *config.yaml*:
```yaml
llm_service:
type: third_party
model_name: <model_name>
temperature: 0.0
```

`<model_name>` refers to the model names available, for example `gpt-4o` for OpenAI. You can find lists of available models here:
1. [OpenAI](https://platform.openai.com/docs/models)
2. [Google](https://ai.google.dev/gemini-api/docs/models/gemini)
3. [Mistral](https://docs.mistral.ai/getting-started/models/)

The `temperature` parameter is optional. The default value is 0.0, but you can change it to something else.

2. Make sure to add your provider's API key to your `.env` file as `[OPENAI|GOOGLE|MISTRAL]_API_KEY`.
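For example, for OpenAI the *.env* entry would look like this (the key value is a placeholder):

```
OPENAI_API_KEY=<your-api-key>
```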

</details>

####

You can set the `use_llm_<...>` parameters in *config.yaml* for fine-grained control over where Prospector uses LLM support. Each `use_llm_<...>` parameter enables or disables LLM support for a specific step:

- **`use_llm_repository_url`**: Choose whether LLMs should be used to obtain the repository URL. With this option enabled, you can omit the `--repository` command line flag and run Prospector with `./run_prospector.sh CVE-2020-1925` (see the configuration sketch below this list).
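Putting it together, a *config.yaml* sketch with this option enabled might look as follows (assuming the flag nests under `llm_service`, as `cli/main.py` reads it via `config.llm_service.use_llm_repository_url`; the `ai_core_sk` file name is illustrative):

```yaml
llm_service:
  type: sap
  model_name: gpt-4-turbo
  temperature: 0.0
  ai_core_sk: sk.json           # path to a file containing the secret keys
  use_llm_repository_url: True  # infer the repository URL with the LLM
```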


## 👩‍💻 Development Setup

Following these steps allows you to run Prospector's components individually: [Backend database and worker containers](#starting-the-backend-database-and-the-job-workers), [RESTful Server](#starting-the-restful-server) for API endpoints, [Prospector CLI](#running-the-cli-version) and [Tests](#testing).
@@ -125,7 +142,7 @@ Afterwards, you will just have to set the environment variables using the `.env`
set -a; source .env; set +a
```

You can configure prospector from CLI or from the `config.yaml` file. The (recommended) API Keys for Github and the NVD can be configured from the `.env` file (which must then be sourced with `set -a; source .env; set +a`)
You can configure Prospector from the CLI or from the *config.yaml* file. The (recommended) API keys for GitHub and the NVD can be configured in the `.env` file (which must then be sourced with `set -a; source .env; set +a`).

If at any time you wish to use a different version of the python interpreter, beware that the `requirements.txt` file contains the exact versioning for `python 3.10.6`.

45 changes: 25 additions & 20 deletions prospector/cli/main.py
@@ -1,14 +1,12 @@
#!/usr/bin/python3
import logging
import os
import signal
import sys
from typing import Any, Dict

from dotenv import load_dotenv

import llm.operations as llm
from llm.model_instantiation import create_model_instance
from llm.llm_service import LLMService
from util.http import ping_backend

path_root = os.getcwd()
@@ -55,23 +53,32 @@ def main(argv): # noqa: C901
)
return

# instantiate LLM model if set in config.yaml
if config.llm_service:
model = create_model_instance(llm_config=config.llm_service)

if not config.repository and not config.use_llm_repository_url:
logger.error(
"Either provide the repository URL or allow LLM usage to obtain it."
)
console.print(
"Either provide the repository URL or allow LLM usage to obtain it.",
status=MessageStatus.ERROR,
)
sys.exit(1)

# if config.ping:
# return ping_backend(backend, get_level() < logging.INFO)

# Whether to use the LLMService
if config.llm_service:
if not config.repository and not config.llm_service.use_llm_repository_url:
logger.error(
"Repository URL was neither specified nor allowed to obtain with LLM support. One must be set."
)
console.print(
"Please set the `--repository` parameter or enable LLM support to infer the repository URL.",
status=MessageStatus.ERROR,
)
return

# Create the LLMService singleton for later use
try:
LLMService(config.llm_service)
except Exception as e:
logger.error(f"Problem with LLMService instantiation: {e}")
console.print(
"LLMService could not be created. Check logs.",
status=MessageStatus.ERROR,
)
return

config.pub_date = (
config.pub_date + "T00:00:00Z" if config.pub_date is not None else ""
)
@@ -81,9 +88,6 @@

logger.debug("Vulnerability ID: " + config.vuln_id)

if not config.repository:
config.repository = llm.get_repository_url(model=model, vuln_id=config.vuln_id)

results, advisory_record = prospector(
vulnerability_id=config.vuln_id,
repository_url=config.repository,
@@ -99,6 +103,7 @@
git_cache=config.git_cache,
limit_candidates=config.max_candidates,
# ignore_adv_refs=config.ignore_refs,
use_llm_repository_url=config.llm_service.use_llm_repository_url,
)

if config.preprocess_only:
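For context on the singleton mentioned in the commit messages: `LLMService(config.llm_service)` is constructed once in `main`, and later no-argument calls such as `LLMService()` in `core/prospector.py` must return that same configured instance. A minimal sketch of a metaclass-based singleton, assuming an implementation along these lines (the actual class in `util` may differ):

```python
class Singleton(type):
    """Metaclass that returns the same instance for every call to the class."""

    _instances = {}

    def __call__(cls, *args, **kwargs):
        # Only the first call runs __init__; later calls reuse the cached instance.
        if cls not in cls._instances:
            cls._instances[cls] = super().__call__(*args, **kwargs)
        return cls._instances[cls]


class LLMService(metaclass=Singleton):
    def __init__(self, config=None):
        self.config = config


# First call (e.g. in cli/main.py) creates and configures the service;
# a later no-argument call (e.g. in core/prospector.py) gets the same object.
service = LLMService({"type": "sap", "model_name": "gpt-4-turbo"})
assert LLMService() is service
```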
11 changes: 6 additions & 5 deletions prospector/config-sample.yaml
@@ -28,12 +28,13 @@ database:
redis_url: redis://redis:6379/0

# LLM Usage (check README for help)
llm_service:
type: sap # use "sap" or "third_party"
model_name: gpt-4-turbo
# temperature: 0.0 # optional, default is 0.0
# llm_service:
# type: sap # use "sap" or "third_party"
# model_name: gpt-4-turbo
# temperature: 0.0 # optional, default is 0.0
# ai_core_sk: <file_path> # needed for type: sap

use_llm_repository_url: True # whether to use LLM's to obtain the repository URL
# use_llm_repository_url: False # whether to use LLMs to obtain the repository URL

# Report file format: "html", "json", "console" or "all"
# and the file name
40 changes: 32 additions & 8 deletions prospector/core/prospector.py
@@ -5,7 +5,7 @@
import re
import sys
import time
from typing import Dict, List, Set, Tuple
from typing import DefaultDict, Dict, List, Set, Tuple
from urllib.parse import urlparse

import requests
@@ -18,6 +18,7 @@
from git.git import Git
from git.raw_commit import RawCommit
from git.version_to_tag import get_possible_tags
from llm.llm_service import LLMService
from log.logger import get_level, logger, pretty_log
from rules.rules import apply_rules
from stats.execution import (
@@ -51,7 +52,7 @@
@measure_execution_time(execution_statistics, name="core")
def prospector( # noqa: C901
vulnerability_id: str,
repository_url: str,
repository_url: str = None,
publication_date: str = "",
vuln_descr: str = "",
version_interval: str = "",
@@ -68,6 +69,7 @@ def prospector( # noqa: C901
rules: List[str] = ["ALL"],
tag_commits: bool = True,
silent: bool = False,
use_llm_repository_url: bool = False,
) -> Tuple[List[Commit], AdvisoryRecord] | Tuple[int, int]:
if silent:
logger.disabled = True
@@ -89,7 +91,28 @@
if advisory_record is None:
return None, -1

fixing_commit = advisory_record.get_fixing_commit(repository_url)
if use_llm_repository_url:
with ConsoleWriter("LLM Usage (Repo URL)") as console:
try:
repository_url = LLMService().get_repository_url(
advisory_record.description, advisory_record.references
)
console.print(
f"\n Repository URL: {repository_url}",
status=MessageStatus.OK,
)
except Exception as e:
logger.error(
e,
exc_info=get_level() < logging.INFO,
)
console.print(
e,
status=MessageStatus.ERROR,
)
sys.exit(1)

fixing_commit = advisory_record.get_fixing_commit()
# print(advisory_record.references)
# obtain a repository object
repository = Git(repository_url, git_cache)
@@ -140,9 +163,7 @@
if len(candidates) > limit_candidates:
logger.error(f"Number of candidates exceeds {limit_candidates}, aborting.")

ConsoleWriter.print(
f"Candidates limit exceeded: {len(candidates)}.",
)
ConsoleWriter.print(f"Candidates limitlimit exceeded: {len(candidates)}.")
return None, len(candidates)

with ExecutionTimer(
@@ -177,7 +198,10 @@
# preprocessed_commits += preprocess_commits(missing, timer)

pbar = tqdm(
missing, desc="Processing commits", unit="commit", disable=silent
missing,
desc="Processing commits",
unit="commit",
disable=silent,
)
start_time = time.time()
with Counter(
@@ -319,7 +343,7 @@ def retrieve_preprocessed_commits(
)
]

logger.error(f"Missing {len(missing)} commits")
logger.info(f"{len(missing)} commits not found in backend")
commits = [Commit.parse_obj(rc) for rc in retrieved_commits]
# Sets the tags
# for commit in commits:
2 changes: 1 addition & 1 deletion prospector/core/report_test.py
Expand Up @@ -2,7 +2,7 @@
import os.path
from random import randint

import prospector.core.report as report
import core.report as report
from datamodel.advisory import build_advisory_record
from datamodel.commit import Commit
from util.sample_data_generation import ( # random_list_of_url,
6 changes: 5 additions & 1 deletion prospector/datamodel/advisory.py
@@ -10,6 +10,7 @@
import validators
from dateutil.parser import isoparse

from llm.llm_service import LLMService
from log.logger import get_level, logger, pretty_log
from util.http import extract_from_webpage, fetch_url, get_urls

@@ -69,6 +70,7 @@ def __init__(
reserved_timestamp: int = 0,
published_timestamp: int = 0,
updated_timestamp: int = 0,
repository_url: str = None,
references: DefaultDict[str, int] = None,
affected_products: List[str] = None,
versions: Dict[str, List[str]] = None,
@@ -81,6 +83,7 @@
self.reserved_timestamp = reserved_timestamp
self.published_timestamp = published_timestamp
self.updated_timestamp = updated_timestamp
self.repository_url = repository_url
self.references = references or defaultdict(lambda: 0)
self.affected_products = affected_products or list()
self.versions = versions or dict()
@@ -133,6 +136,7 @@ def parse_references_from_third_party(self):
self.references[self.extract_hashes(ref)] += 2

def get_advisory(self):
"""Fills the advisory record with information obtained from an advisory API."""
details, metadata = get_from_mitre(self.cve_id)
if metadata is None:
raise Exception("MITRE API Error")
@@ -176,7 +180,7 @@
]
self.versions["fixed"] = [v for v in self.versions["fixed"] if v is not None]

def get_fixing_commit(self, repository) -> List[str]:
def get_fixing_commit(self) -> List[str]:
self.references = dict(
sorted(self.references.items(), key=lambda item: item[1], reverse=True)
)
3 changes: 2 additions & 1 deletion prospector/git/git_test.py
@@ -42,7 +42,8 @@ def test_get_tags_for_commit(repository: Git):
commit = commits.get(OPENCAST_COMMIT)
if commit is not None:
tags = commit.find_tags()
assert len(tags) == 75
print(tags)
assert len(tags) >= 106
assert "10.2" in tags and "11.3" in tags and "9.4" in tags


2 changes: 1 addition & 1 deletion prospector/git/raw_commit_test.py
@@ -26,7 +26,7 @@

def test_find_tags(commit: RawCommit):
tags = commit.find_tags()
assert len(tags) == 75
assert len(tags) >= 106
assert "10.2" in tags and "11.3" in tags and "9.4" in tags

