Skip to content

Commit

Permalink
Only apply semgrep to codemods with results
Browse files Browse the repository at this point in the history
  • Loading branch information
drdavella committed Oct 27, 2023
1 parent 9b80e96 commit deb18c6
Show file tree
Hide file tree
Showing 3 changed files with 25 additions and 20 deletions.
26 changes: 22 additions & 4 deletions src/codemodder/codemodder.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from concurrent.futures import ThreadPoolExecutor
import datetime
import difflib
import itertools
import logging
import os
import sys
Expand All @@ -19,6 +20,7 @@
from codemodder.executor import CodemodExecutorWrapper
from codemodder.project_analysis.python_repo_manager import PythonRepoManager
from codemodder.report.codetf_reporter import report_default
from codemodder.semgrep import run as run_semgrep


def update_code(file_path, new_code):
Expand All @@ -29,6 +31,18 @@ def update_code(file_path, new_code):
f.write(new_code)


def find_semgrep_results(
    context: CodemodExecutionContext,
    codemods: list[CodemodExecutorWrapper],
) -> set[str]:
    """Collect the rule IDs semgrep reports for the combined rules of all codemods.

    Semgrep is invoked a single time, with the YAML configuration files of
    every codemod in ``codemods`` chained together. Codemods whose
    ``yaml_files`` attribute is falsy contribute nothing to that invocation.

    Args:
        context: Execution context handed straight through to ``run_semgrep``.
        codemods: Codemod wrappers whose ``yaml_files`` are gathered.

    Returns:
        The set of every rule ID found in the values of the mapping returned
        by ``run_semgrep``.
    """
    configs_per_codemod = [
        wrapper.yaml_files for wrapper in codemods if wrapper.yaml_files
    ]
    # Flatten lazily: run_semgrep receives one iterable spanning all codemods.
    all_configs = itertools.chain.from_iterable(configs_per_codemod)
    semgrep_output = run_semgrep(context, all_configs)

    # Each value of the semgrep output is iterated for its rule IDs; only the
    # IDs matter here, not which entry they were reported under.
    matched_rule_ids: set[str] = set()
    for rules_for_entry in semgrep_output.values():
        matched_rule_ids.update(rules_for_entry)
    return matched_rule_ids


def apply_codemod_to_file(
base_directory: Path,
file_context,
Expand Down Expand Up @@ -184,7 +198,7 @@ def run(original_args) -> int:
log_list(logging.INFO, "including paths", argv.path_include)
log_list(logging.INFO, "excluding paths", argv.path_exclude)

files_to_analyze = match_files(
files_to_analyze: list[Path] = match_files(
context.directory, argv.path_exclude, argv.path_include
)
if not files_to_analyze:
Expand All @@ -195,18 +209,22 @@ def run(original_args) -> int:
logger.debug("matched files:")
log_list(logging.DEBUG, "matched files", full_names)

semgrep_results: set[str] = find_semgrep_results(context, codemods_to_run)

log_section("scanning")
# run codemods one at a time making sure to respect the given sequence
for codemod in codemods_to_run:
logger.info("running codemod %s", codemod.id)
results = codemod.apply(context)
if codemod.is_semgrep and not results:
# Unfortunately the IDs from semgrep are not fully specified
# TODO: eventually we need to be able to use fully specified IDs here
if codemod.is_semgrep and codemod.name not in semgrep_results:
logger.debug(
"no results from semgrep for %s, skipping analysis",
codemod.id,
)
continue

logger.info("running codemod %s", codemod.id)
results = codemod.apply(context)
analyze_files(
context,
files_to_analyze,
Expand Down
15 changes: 1 addition & 14 deletions src/codemodder/sarifs.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
from collections import defaultdict
import json
from pathlib import Path
from typing import List, Union
from typing import Union


def extract_rule_id(result, sarif_run) -> Union[str, None]:
Expand Down Expand Up @@ -43,15 +42,3 @@ def results_by_path_and_rule_id(sarif_file):
rule_id_dict.setdefault(rule_id, []).append(r)
path_and_ruleid_dict[path].update(rule_id_dict)
return path_and_ruleid_dict


def parse_sarif_files(sarifs: List[Path]) -> defaultdict[str, defaultdict[str, List]]:
    """
    Parse SARIF files and organize their results into a dict of dicts keyed
    by path and then by rule ID.

    Results from every file are merged: when several SARIF files report on
    the same path (or the same path and rule ID), all of their results are
    kept, in the order the files are given.

    Args:
        sarifs: Paths of the SARIF files to parse. An empty list yields an
            empty mapping.

    Returns:
        A nested ``defaultdict`` mapping ``path -> rule_id -> list of results``.
    """
    path_id_dict: defaultdict[str, defaultdict[str, List]] = defaultdict(
        lambda: defaultdict(list)
    )
    for sarif in sarifs:
        per_path = results_by_path_and_rule_id(Path(sarif))
        # Merge entry by entry instead of calling dict.update(): update()
        # would replace the whole per-path mapping, dropping results that an
        # earlier SARIF file recorded for the same path, and would store
        # plain dicts where the return type promises defaultdicts.
        for path, by_rule_id in per_path.items():
            merged_for_path = path_id_dict[path]
            for rule_id, results in by_rule_id.items():
                merged_for_path[rule_id].extend(results)
    return path_id_dict
4 changes: 2 additions & 2 deletions src/codemodder/semgrep.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
import subprocess
import itertools
from tempfile import NamedTemporaryFile
from typing import List
from typing import Iterable
from pathlib import Path
from codemodder.context import CodemodExecutionContext
from codemodder.sarifs import results_by_path_and_rule_id
from codemodder.logging import logger


def run(execution_context: CodemodExecutionContext, yaml_files: List[Path]) -> dict:
def run(execution_context: CodemodExecutionContext, yaml_files: Iterable[Path]) -> dict:
"""
Runs Semgrep and outputs a dict with the results organized by rule_id.
"""
Expand Down

0 comments on commit deb18c6

Please sign in to comment.