Skip to content

Commit

Permalink
Implements Rule Phases (#395)
Browse files Browse the repository at this point in the history
1. A set of rules to apply can now be selected in config.yaml.
Initially, it is set to all rules except the ones requiring llm_service
(Phase 2 rules).
2. Rules are now applied in phases. All original Prospector rules are
applied in "Phase 1" to all commits. Phase 2 applies its rules only to a
subset of the ranked commits from Phase 1.
  • Loading branch information
lauraschauer authored Jul 11, 2024
1 parent 2633695 commit eb8e552
Show file tree
Hide file tree
Showing 13 changed files with 188 additions and 154 deletions.
3 changes: 2 additions & 1 deletion prospector/cli/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ def main(argv): # noqa: C901
)
return

# Create the LLMService singleton for later use
# Create the LLMService Singleton for later use
try:
LLMService(config.llm_service)
except Exception as e:
Expand Down Expand Up @@ -104,6 +104,7 @@ def main(argv): # noqa: C901
limit_candidates=config.max_candidates,
# ignore_adv_refs=config.ignore_refs,
use_llm_repository_url=config.llm_service.use_llm_repository_url,
enabled_rules=config.enabled_rules,
)

if config.preprocess_only:
Expand Down
21 changes: 21 additions & 0 deletions prospector/config-sample.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,27 @@ redis_url: redis://redis:6379/0

# use_llm_repository_url: False # whether to use LLMs to obtain the repository URL

enabled_rules:
# Phase 1 Rules
- VULN_ID_IN_MESSAGE
- XREF_BUG
- XREF_GH
- COMMIT_IN_REFERENCE
- VULN_ID_IN_LINKED_ISSUE
- CHANGES_RELEVANT_FILES
- CHANGES_RELEVANT_CODE
- RELEVANT_WORDS_IN_MESSAGE
- ADV_KEYWORDS_IN_FILES
- ADV_KEYWORDS_IN_MSG
- SEC_KEYWORDS_IN_MESSAGE
- SEC_KEYWORDS_IN_LINKED_GH
- SEC_KEYWORDS_IN_LINKED_BUG
- GITHUB_ISSUE_IN_MESSAGE
- BUG_IN_MESSAGE
- COMMIT_HAS_TWINS
# Phase 2 Rules (llm_service required!):
# - COMMIT_IS_SECURITY_RELEVANT

# Report file format: "html", "json", "console" or "all"
# and the file name
report:
Expand Down
32 changes: 24 additions & 8 deletions prospector/core/prospector.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,25 +2,24 @@

import logging
import os
import re
import sys
import time
from typing import DefaultDict, Dict, List, Set, Tuple
from typing import Dict, List, Set, Tuple
from urllib.parse import urlparse

import requests
from tqdm import tqdm

from cli.console import ConsoleWriter, MessageStatus
from datamodel.advisory import AdvisoryRecord, build_advisory_record
from datamodel.commit import Commit, apply_ranking, make_from_raw_commit
from datamodel.commit import Commit, make_from_raw_commit
from filtering.filter import filter_commits
from git.git import Git
from git.raw_commit import RawCommit
from git.version_to_tag import get_possible_tags
from llm.llm_service import LLMService
from log.logger import get_level, logger, pretty_log
from rules.rules import apply_rules
from rules.rules import RULES_PHASE_1, apply_rules
from stats.execution import (
Counter,
ExecutionTimer,
Expand Down Expand Up @@ -66,7 +65,7 @@ def prospector( # noqa: C901
use_backend: str = USE_BACKEND_ALWAYS,
git_cache: str = "/tmp/git_cache",
limit_candidates: int = MAX_CANDIDATES,
rules: List[str] = ["ALL"],
enabled_rules: List[str] = [rule.id for rule in RULES_PHASE_1],
tag_commits: bool = True,
silent: bool = False,
use_llm_repository_url: bool = False,
Expand Down Expand Up @@ -231,7 +230,9 @@ def prospector( # noqa: C901
else:
logger.warning("Preprocessed commits are not being sent to backend")

ranked_candidates = evaluate_commits(preprocessed_commits, advisory_record, rules)
ranked_candidates = evaluate_commits(
preprocessed_commits, advisory_record, enabled_rules
)

# ConsoleWriter.print("Commit ranking and aggregation...")
ranked_candidates = remove_twins(ranked_candidates)
Expand Down Expand Up @@ -267,11 +268,26 @@ def filter(commits: Dict[str, RawCommit]) -> Dict[str, RawCommit]:


def evaluate_commits(
commits: List[Commit], advisory: AdvisoryRecord, rules: List[str]
commits: List[Commit], advisory: AdvisoryRecord, enabled_rules: List[str]
) -> List[Commit]:
"""This method applies the rule phases. Each phase is associated with a set of rules:
- Phase 1: Original rules
- Phase 2: Rules using the LLMService
Args:
commits: the list of candidate commits that rules should be applied to
advisory: the object containing all information about the advisory
enabled_rules: a (sub)set of rules to run (to set in config.yaml)
Returns:
a list of commits ranked according to their relevance score
Raises:
MissingMandatoryValue: if there is an error in the LLM configuration object
"""
with ExecutionTimer(core_statistics.sub_collection("candidates analysis")):
with ConsoleWriter("Candidate analysis") as _:
ranked_commits = apply_ranking(apply_rules(commits, advisory, rules=rules))
ranked_commits = apply_rules(commits, advisory, enabled_rules=enabled_rules)

return ranked_commits

Expand Down
1 change: 1 addition & 0 deletions prospector/datamodel/commit.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ def __eq__(self, other: "Commit") -> bool:
return self.relevance == other.relevance

def add_match(self, rule: Dict[str, Any]):
"""Adds a rule to the commit's matched rules. Makes sure that the rule is added in order of relevance."""
for i, r in enumerate(self.matched_rules):
if rule["relevance"] == r["relevance"]:
self.matched_rules.insert(i, rule)
Expand Down
1 change: 1 addition & 0 deletions prospector/llm/models/gemini.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ def _call(

try:
response = requests.post(endpoint, headers=headers, json=data)
response.raise_for_status()
return self.parse(response.json())
except requests.exceptions.HTTPError as http_error:
logger.error(
Expand Down
1 change: 1 addition & 0 deletions prospector/llm/models/mistral.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ def _call(

try:
response = requests.post(endpoint, headers=headers, json=data)
response.raise_for_status()
return self.parse(response.json())
except requests.exceptions.HTTPError as http_error:
logger.error(
Expand Down
1 change: 1 addition & 0 deletions prospector/llm/models/openai.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ def _call(

try:
response = requests.post(endpoint, headers=headers, json=data)
response.raise_for_status()
return self.parse(response.json())
except requests.exceptions.HTTPError as http_error:
logger.error(
Expand Down
1 change: 1 addition & 0 deletions prospector/requirements.in
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ beautifulsoup4
colorama
datasketch
fastapi
google-cloud-aiplatform==1.49.0
Jinja2
langchain
langchain_openai
Expand Down
2 changes: 1 addition & 1 deletion prospector/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ frozenlist==1.4.1
fsspec==2024.6.0
google-api-core==2.19.0
google-auth==2.29.0
google-cloud-aiplatform==1.53.0
google-cloud-aiplatform==1.49.0
google-cloud-bigquery==3.24.0
google-cloud-core==2.4.1
google-cloud-resource-manager==1.12.3
Expand Down
87 changes: 36 additions & 51 deletions prospector/rules/rules.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,23 +2,31 @@
from abc import abstractmethod
from typing import List, Tuple

import requests

from datamodel.advisory import AdvisoryRecord
from datamodel.commit import Commit
from datamodel.nlp import clean_string, find_similar_words
from datamodel.commit import Commit, apply_ranking
from llm.llm_service import LLMService
from rules.helpers import extract_security_keywords
from stats.execution import Counter, execution_statistics
from util.lsh import build_lsh_index, decode_minhash

NUM_COMMITS_PHASE_2 = (
10 # Determines how many candidates the second rule phase is applied to
)


rule_statistics = execution_statistics.sub_collection("rules")


class Rule:
lsh_index = None
llm_service: LLMService = None

def __init__(self, id: str, relevance: int):
self.id = id
self.message = ""
self.relevance = relevance
self.message = ""

@abstractmethod
def apply(self, candidate: Commit, advisory_record: AdvisoryRecord) -> bool:
Expand All @@ -37,54 +45,50 @@ def as_dict(self):
def get_rule_as_tuple(self) -> Tuple[str, str, int]:
return (self.id, self.message, self.relevance)

def get_id(self):
return self.id


def apply_rules(
candidates: List[Commit],
advisory_record: AdvisoryRecord,
rules=["ALL"],
enabled_rules: List[str] = [],
) -> List[Commit]:
enabled_rules = get_enabled_rules(rules)
"""Applies the selected set of rules and returns the ranked list of commits."""

rule_statistics.collect("active", len(enabled_rules), unit="rules")
phase_1_rules = [rule for rule in RULES_PHASE_1 if rule.get_id() in enabled_rules]
phase_2_rules = [rule for rule in RULES_PHASE_2 if rule.get_id() in enabled_rules]

Rule.lsh_index = build_lsh_index()
if phase_2_rules:
Rule.llm_service = LLMService()

rule_statistics.collect(
"active", len(phase_1_rules) + len(phase_2_rules), unit="rules"
)

Rule.lsh_index = build_lsh_index()
for candidate in candidates:
Rule.lsh_index.insert(candidate.commit_id, decode_minhash(candidate.minhash))

with Counter(rule_statistics) as counter:
counter.initialize("matches", unit="matches")
for candidate in candidates:
for rule in enabled_rules:
for rule in phase_1_rules:
if rule.apply(candidate, advisory_record):
counter.increment("matches")
candidate.add_match(rule.as_dict())
candidate.compute_relevance()

# for candidate in candidates:
# if candidate.has_twin():
# for twin in candidate.twins:
# for other_candidate in candidates:
# if (
# other_candidate.commit_id == twin[1]
# and other_candidate.relevance > candidate.relevance
# ):
# candidate.relevance = other_candidate.relevance
# # Add a reason on why we are doing this.
candidates = apply_ranking(candidates)

return candidates


def get_enabled_rules(rules: List[str]) -> List[Rule]:
if "ALL" in rules:
return RULES

enabled_rules = []
for r in RULES:
if r.id in rules:
enabled_rules.append(r)
for candidate in candidates[:NUM_COMMITS_PHASE_2]:
for rule in phase_2_rules:
if rule.apply(candidate):
counter.increment("matches")
candidate.add_match(rule.as_dict())
candidate.compute_relevance()

return enabled_rules
return apply_ranking(candidates)


# TODO: This could include issues, PRs, etc.
Expand Down Expand Up @@ -409,7 +413,7 @@ def apply(self, candidate: Commit, advisory_record: AdvisoryRecord):
return False


RULES: List[Rule] = [
RULES_PHASE_1: List[Rule] = [
VulnIdInMessage("VULN_ID_IN_MESSAGE", 64),
# CommitMentionedInAdv("COMMIT_IN_ADVISORY", 64),
CrossReferencedBug("XREF_BUG", 32),
Expand All @@ -429,23 +433,4 @@ def apply(self, candidate: Commit, advisory_record: AdvisoryRecord):
CommitHasTwins("COMMIT_HAS_TWINS", 2),
]

rules_list = [
"COMMIT_IN_REFERENCE",
"VULN_ID_IN_MESSAGE",
"VULN_ID_IN_LINKED_ISSUE",
"XREF_BUG",
"XREF_GH",
"CHANGES_RELEVANT_FILES",
"CHANGES_RELEVANT_CODE",
"RELEVANT_WORDS_IN_MESSAGE",
"ADV_KEYWORDS_IN_FILES",
"ADV_KEYWORDS_IN_MSG",
"SEC_KEYWORDS_IN_MESSAGE",
"SEC_KEYWORDS_IN_LINKED_GH",
"SEC_KEYWORDS_IN_LINKED_BUG",
"GITHUB_ISSUE_IN_MESSAGE",
"BUG_IN_MESSAGE",
"COMMIT_HAS_TWINS",
]

# print(" & ".join([f"\\rot{{{x}}}" for x in rules_list]))
RULES_PHASE_2: List[Rule] = []
Loading

0 comments on commit eb8e552

Please sign in to comment.