Implements Rule Phases (#395)

1. A set of rules to apply can now be selected in config.yaml. Initially, it is set to all rules except the ones requiring llm_service (Phase 2 rules). 2. Rules are now applied in phases. All original Prospector rules are applied in "Phase 1" to all commits. Phase 2 applies its rules only to a subset of the ranked commits from Phase 1.
SAP · Jul 11, 2024 · eb8e552 · eb8e552
1 parent 2633695
commit eb8e552
Show file tree

Hide file tree

Showing 13 changed files with 188 additions and 154 deletions.
diff --git a/prospector/cli/main.py b/prospector/cli/main.py
@@ -68,7 +68,7 @@ def main(argv):  # noqa: C901
                 )
                 return
 
-            # Create the LLMService singleton for later use
+            # Create the LLMService Singleton for later use
             try:
                 LLMService(config.llm_service)
             except Exception as e:
@@ -104,6 +104,7 @@ def main(argv):  # noqa: C901
         limit_candidates=config.max_candidates,
         # ignore_adv_refs=config.ignore_refs,
         use_llm_repository_url=config.llm_service.use_llm_repository_url,
+        enabled_rules=config.enabled_rules,
     )
 
     if config.preprocess_only:

diff --git a/prospector/config-sample.yaml b/prospector/config-sample.yaml
@@ -36,6 +36,27 @@ redis_url: redis://redis:6379/0
 
 #   use_llm_repository_url: False # whether to use LLM's to obtain the repository URL
 
+enabled_rules:
+  # Phase 1 Rules
+  - VULN_ID_IN_MESSAGE
+  - XREF_BUG
+  - XREF_GH
+  - COMMIT_IN_REFERENCE
+  - VULN_ID_IN_LINKED_ISSUE
+  - CHANGES_RELEVANT_FILES
+  - CHANGES_RELEVANT_CODE
+  - RELEVANT_WORDS_IN_MESSAGE
+  - ADV_KEYWORDS_IN_FILES
+  - ADV_KEYWORDS_IN_MSG
+  - SEC_KEYWORDS_IN_MESSAGE
+  - SEC_KEYWORDS_IN_LINKED_GH
+  - SEC_KEYWORDS_IN_LINKED_BUG
+  - GITHUB_ISSUE_IN_MESSAGE
+  - BUG_IN_MESSAGE
+  - COMMIT_HAS_TWINS
+  # Phase 2 Rules (llm_service required!):
+  # - COMMIT_IS_SECURITY_RELEVANT
+
 # Report file format: "html", "json", "console" or "all"
 # and the file name
 report:

diff --git a/prospector/core/prospector.py b/prospector/core/prospector.py
@@ -2,25 +2,24 @@
 
 import logging
 import os
-import re
 import sys
 import time
-from typing import DefaultDict, Dict, List, Set, Tuple
+from typing import Dict, List, Set, Tuple
 from urllib.parse import urlparse
 
 import requests
 from tqdm import tqdm
 
 from cli.console import ConsoleWriter, MessageStatus
 from datamodel.advisory import AdvisoryRecord, build_advisory_record
-from datamodel.commit import Commit, apply_ranking, make_from_raw_commit
+from datamodel.commit import Commit, make_from_raw_commit
 from filtering.filter import filter_commits
 from git.git import Git
 from git.raw_commit import RawCommit
 from git.version_to_tag import get_possible_tags
 from llm.llm_service import LLMService
 from log.logger import get_level, logger, pretty_log
-from rules.rules import apply_rules
+from rules.rules import RULES_PHASE_1, apply_rules
 from stats.execution import (
     Counter,
     ExecutionTimer,
@@ -66,7 +65,7 @@ def prospector(  # noqa: C901
     use_backend: str = USE_BACKEND_ALWAYS,
     git_cache: str = "/tmp/git_cache",
     limit_candidates: int = MAX_CANDIDATES,
-    rules: List[str] = ["ALL"],
+    enabled_rules: List[str] = [rule.id for rule in RULES_PHASE_1],
     tag_commits: bool = True,
     silent: bool = False,
     use_llm_repository_url: bool = False,
@@ -231,7 +230,9 @@ def prospector(  # noqa: C901
     else:
         logger.warning("Preprocessed commits are not being sent to backend")
 
-    ranked_candidates = evaluate_commits(preprocessed_commits, advisory_record, rules)
+    ranked_candidates = evaluate_commits(
+        preprocessed_commits, advisory_record, enabled_rules
+    )
 
     # ConsoleWriter.print("Commit ranking and aggregation...")
     ranked_candidates = remove_twins(ranked_candidates)
@@ -267,11 +268,26 @@ def filter(commits: Dict[str, RawCommit]) -> Dict[str, RawCommit]:
 
 
 def evaluate_commits(
-    commits: List[Commit], advisory: AdvisoryRecord, rules: List[str]
+    commits: List[Commit], advisory: AdvisoryRecord, enabled_rules: List[str]
 ) -> List[Commit]:
+    """This method applies the rule phases. Each phase is associated with a set of rules:
+        - Phase 1: Original rules
+        - Phase 2: Rules using the LLMService
+
+    Args:
+        commits: the list of candidate commits that rules should be applied to
+        advisory: the object containing all information about the advisory
+        enabled_rules: a (sub)set of rules to run (to set in config.yaml)
+
+    Returns:
+        a list of commits ranked according to their relevance score
+
+    Raises:
+        MissingMandatoryValue: if there is an error in the LLM configuration object
+    """
     with ExecutionTimer(core_statistics.sub_collection("candidates analysis")):
         with ConsoleWriter("Candidate analysis") as _:
-            ranked_commits = apply_ranking(apply_rules(commits, advisory, rules=rules))
+            ranked_commits = apply_rules(commits, advisory, enabled_rules=enabled_rules)
 
     return ranked_commits
 

diff --git a/prospector/datamodel/commit.py b/prospector/datamodel/commit.py
@@ -52,6 +52,7 @@ def __eq__(self, other: "Commit") -> bool:
         return self.relevance == other.relevance
 
     def add_match(self, rule: Dict[str, Any]):
+        """Adds a rule to the commit's matched rules. Makes sure that the rule is added in order of relevance."""
         for i, r in enumerate(self.matched_rules):
             if rule["relevance"] == r["relevance"]:
                 self.matched_rules.insert(i, rule)

diff --git a/prospector/llm/models/gemini.py b/prospector/llm/models/gemini.py
@@ -60,6 +60,7 @@ def _call(
 
         try:
             response = requests.post(endpoint, headers=headers, json=data)
+            response.raise_for_status()
             return self.parse(response.json())
         except requests.exceptions.HTTPError as http_error:
             logger.error(

diff --git a/prospector/llm/models/mistral.py b/prospector/llm/models/mistral.py
@@ -41,6 +41,7 @@ def _call(
 
         try:
             response = requests.post(endpoint, headers=headers, json=data)
+            response.raise_for_status()
             return self.parse(response.json())
         except requests.exceptions.HTTPError as http_error:
             logger.error(

diff --git a/prospector/llm/models/openai.py b/prospector/llm/models/openai.py
@@ -44,6 +44,7 @@ def _call(
 
         try:
             response = requests.post(endpoint, headers=headers, json=data)
+            response.raise_for_status()
             return self.parse(response.json())
         except requests.exceptions.HTTPError as http_error:
             logger.error(

diff --git a/prospector/requirements.in b/prospector/requirements.in
@@ -3,6 +3,7 @@ beautifulsoup4
 colorama
 datasketch
 fastapi
+google-cloud-aiplatform==1.49.0
 Jinja2
 langchain
 langchain_openai

diff --git a/prospector/requirements.txt b/prospector/requirements.txt
@@ -39,7 +39,7 @@ frozenlist==1.4.1
 fsspec==2024.6.0
 google-api-core==2.19.0
 google-auth==2.29.0
-google-cloud-aiplatform==1.53.0
+google-cloud-aiplatform==1.49.0
 google-cloud-bigquery==3.24.0
 google-cloud-core==2.4.1
 google-cloud-resource-manager==1.12.3

diff --git a/prospector/rules/rules.py b/prospector/rules/rules.py
@@ -2,23 +2,31 @@
 from abc import abstractmethod
 from typing import List, Tuple
 
+import requests
+
 from datamodel.advisory import AdvisoryRecord
-from datamodel.commit import Commit
-from datamodel.nlp import clean_string, find_similar_words
+from datamodel.commit import Commit, apply_ranking
+from llm.llm_service import LLMService
 from rules.helpers import extract_security_keywords
 from stats.execution import Counter, execution_statistics
 from util.lsh import build_lsh_index, decode_minhash
 
+NUM_COMMITS_PHASE_2 = (
+    10  # Determines how many candidates the second rule phase is applied to
+)
+
+
 rule_statistics = execution_statistics.sub_collection("rules")
 
 
 class Rule:
     lsh_index = None
+    llm_service: LLMService = None
 
     def __init__(self, id: str, relevance: int):
         self.id = id
-        self.message = ""
         self.relevance = relevance
+        self.message = ""
 
     @abstractmethod
     def apply(self, candidate: Commit, advisory_record: AdvisoryRecord) -> bool:
@@ -37,54 +45,50 @@ def as_dict(self):
     def get_rule_as_tuple(self) -> Tuple[str, str, int]:
         return (self.id, self.message, self.relevance)
 
+    def get_id(self):
+        return self.id
+
 
 def apply_rules(
     candidates: List[Commit],
     advisory_record: AdvisoryRecord,
-    rules=["ALL"],
+    enabled_rules: List[str] = [],
 ) -> List[Commit]:
-    enabled_rules = get_enabled_rules(rules)
+    """Applies the selected set of rules and returns the ranked list of commits."""
 
-    rule_statistics.collect("active", len(enabled_rules), unit="rules")
+    phase_1_rules = [rule for rule in RULES_PHASE_1 if rule.get_id() in enabled_rules]
+    phase_2_rules = [rule for rule in RULES_PHASE_2 if rule.get_id() in enabled_rules]
 
-    Rule.lsh_index = build_lsh_index()
+    if phase_2_rules:
+        Rule.llm_service = LLMService()
 
+    rule_statistics.collect(
+        "active", len(phase_1_rules) + len(phase_2_rules), unit="rules"
+    )
+
+    Rule.lsh_index = build_lsh_index()
     for candidate in candidates:
         Rule.lsh_index.insert(candidate.commit_id, decode_minhash(candidate.minhash))
 
     with Counter(rule_statistics) as counter:
         counter.initialize("matches", unit="matches")
         for candidate in candidates:
-            for rule in enabled_rules:
+            for rule in phase_1_rules:
                 if rule.apply(candidate, advisory_record):
                     counter.increment("matches")
                     candidate.add_match(rule.as_dict())
             candidate.compute_relevance()
 
-    # for candidate in candidates:
-    #     if candidate.has_twin():
-    #         for twin in candidate.twins:
-    #             for other_candidate in candidates:
-    #                 if (
-    #                     other_candidate.commit_id == twin[1]
-    #                     and other_candidate.relevance > candidate.relevance
-    #                 ):
-    #                     candidate.relevance = other_candidate.relevance
-    #                     # Add a reason on why we are doing this.
+        candidates = apply_ranking(candidates)
 
-    return candidates
-
-
-def get_enabled_rules(rules: List[str]) -> List[Rule]:
-    if "ALL" in rules:
-        return RULES
-
-    enabled_rules = []
-    for r in RULES:
-        if r.id in rules:
-            enabled_rules.append(r)
+        for candidate in candidates[:NUM_COMMITS_PHASE_2]:
+            for rule in phase_2_rules:
+                if rule.apply(candidate):
+                    counter.increment("matches")
+                    candidate.add_match(rule.as_dict())
+            candidate.compute_relevance()
 
-    return enabled_rules
+    return apply_ranking(candidates)
 
 
 # TODO: This could include issues, PRs, etc.
@@ -409,7 +413,7 @@ def apply(self, candidate: Commit, advisory_record: AdvisoryRecord):
         return False
 
 
-RULES: List[Rule] = [
+RULES_PHASE_1: List[Rule] = [
     VulnIdInMessage("VULN_ID_IN_MESSAGE", 64),
     # CommitMentionedInAdv("COMMIT_IN_ADVISORY", 64),
     CrossReferencedBug("XREF_BUG", 32),
@@ -429,23 +433,4 @@ def apply(self, candidate: Commit, advisory_record: AdvisoryRecord):
     CommitHasTwins("COMMIT_HAS_TWINS", 2),
 ]
 
-rules_list = [
-    "COMMIT_IN_REFERENCE",
-    "VULN_ID_IN_MESSAGE",
-    "VULN_ID_IN_LINKED_ISSUE",
-    "XREF_BUG",
-    "XREF_GH",
-    "CHANGES_RELEVANT_FILES",
-    "CHANGES_RELEVANT_CODE",
-    "RELEVANT_WORDS_IN_MESSAGE",
-    "ADV_KEYWORDS_IN_FILES",
-    "ADV_KEYWORDS_IN_MSG",
-    "SEC_KEYWORDS_IN_MESSAGE",
-    "SEC_KEYWORDS_IN_LINKED_GH",
-    "SEC_KEYWORDS_IN_LINKED_BUG",
-    "GITHUB_ISSUE_IN_MESSAGE",
-    "BUG_IN_MESSAGE",
-    "COMMIT_HAS_TWINS",
-]
-
-# print(" & ".join([f"\\rot{{{x}}}" for x in rules_list]))
+RULES_PHASE_2: List[Rule] = []