diff --git a/capa/features/extractors/binja/extractor.py b/capa/features/extractors/binja/extractor.py index 1d4dd6bd7..32e1325c6 100644 --- a/capa/features/extractors/binja/extractor.py +++ b/capa/features/extractors/binja/extractor.py @@ -5,15 +5,19 @@ # Unless required by applicable law or agreed to in writing, software distributed under the License # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and limitations under the License. +import logging from typing import Iterator +from collections import defaultdict import binaryninja as binja -from binaryninja import ILException +from binaryninja import Function, BinaryView, SymbolType, ILException, RegisterValueType, LowLevelILOperation +import capa.perf import capa.features.extractors.elf import capa.features.extractors.binja.file import capa.features.extractors.binja.insn import capa.features.extractors.binja.global_ +import capa.features.extractors.binja.helpers import capa.features.extractors.binja.function import capa.features.extractors.binja.basicblock from capa.features.common import Feature @@ -26,6 +30,8 @@ StaticFeatureExtractor, ) +logger = logging.getLogger(__name__) + class BinjaFeatureExtractor(StaticFeatureExtractor): def __init__(self, bv: binja.BinaryView): @@ -36,6 +42,9 @@ def __init__(self, bv: binja.BinaryView): self.global_features.extend(capa.features.extractors.binja.global_.extract_os(self.bv)) self.global_features.extend(capa.features.extractors.binja.global_.extract_arch(self.bv)) + with capa.perf.timing("binary ninja: computing call graph"): + self._call_graph = self._build_call_graph() + def get_base_address(self): return AbsoluteVirtualAddress(self.bv.start) @@ -45,9 +54,27 @@ def extract_global_features(self): def extract_file_features(self): yield from capa.features.extractors.binja.file.extract_features(self.bv) + def _build_call_graph(self): + # from function address to function addresses + calls_from: defaultdict[int, set[int]] = defaultdict(set) + calls_to: defaultdict[int, set[int]] = defaultdict(set) + + f: Function + for f in self.bv.functions: + for caller in f.callers: + calls_from[caller.start].add(f.start) + calls_to[f.start].add(caller.start) + + call_graph = { + "calls_to": calls_to, + "calls_from": calls_from, + } + + return call_graph + def get_functions(self) -> Iterator[FunctionHandle]: for f in self.bv.functions: - yield FunctionHandle(address=AbsoluteVirtualAddress(f.start), inner=f) + yield FunctionHandle(address=AbsoluteVirtualAddress(f.start), inner=f, ctx={"call_graph": self._call_graph}) def extract_function_features(self, fh: FunctionHandle) -> Iterator[tuple[Feature, Address]]: yield from capa.features.extractors.binja.function.extract_features(fh) @@ -76,13 +103,16 @@ def extract_basic_block_features(self, fh: FunctionHandle, bbh: BBHandle) -> Ite yield from capa.features.extractors.binja.basicblock.extract_features(fh, bbh) def get_instructions(self, fh: FunctionHandle, bbh: BBHandle) -> Iterator[InsnHandle]: - import capa.features.extractors.binja.helpers as binja_helpers + f: binja.Function = fh.inner - bb: tuple[binja.BasicBlock, binja.MediumLevelILBasicBlock] = bbh.inner - addr = bb[0].start + bb: binja.BasicBlock + mlbb: binja.MediumLevelILBasicBlock + bb, mlbb = bbh.inner - for text, length in bb[0]: - insn = binja_helpers.DisassemblyInstruction(addr, length, text) + addr: int = bb.start + for text, length in bb: + llil = f.get_llils_at(addr) + insn = capa.features.extractors.binja.helpers.DisassemblyInstruction(addr, length, text, llil) yield InsnHandle(address=AbsoluteVirtualAddress(addr), inner=insn) addr += length diff --git a/capa/features/extractors/binja/function.py b/capa/features/extractors/binja/function.py index dfddfda2b..a2d284ec7 100644 --- a/capa/features/extractors/binja/function.py +++ b/capa/features/extractors/binja/function.py @@ -7,7 +7,7 @@ # See the License for the specific language governing permissions and limitations under the License. from typing import Iterator -from binaryninja import Function, BinaryView, SymbolType, ILException, RegisterValueType, LowLevelILOperation +from binaryninja import Function, BinaryView, SymbolType from capa.features.file import FunctionName from capa.features.common import Feature, Characteristic @@ -20,38 +20,24 @@ def extract_function_calls_to(fh: FunctionHandle): """extract callers to a function""" func: Function = fh.inner - for caller in func.caller_sites: - # Everything that is a code reference to the current function is considered a caller, which actually includes - # many other references that are NOT a caller. For example, an instruction `push function_start` will also be - # considered a caller to the function - llil = None - try: - # Temporary fix for https://github.com/Vector35/binaryninja-api/issues/6020. Since `.llil` can throw an - # exception rather than returning None - llil = caller.llil - except ILException: + caller: int + for caller in fh.ctx["call_graph"]["calls_to"].get(func.start, []): + if caller == func.start: continue - if (llil is None) or llil.operation not in [ - LowLevelILOperation.LLIL_CALL, - LowLevelILOperation.LLIL_CALL_STACK_ADJUST, - LowLevelILOperation.LLIL_JUMP, - LowLevelILOperation.LLIL_TAILCALL, - ]: - continue + yield Characteristic("calls to"), AbsoluteVirtualAddress(caller) - if llil.dest.value.type not in [ - RegisterValueType.ImportedAddressValue, - RegisterValueType.ConstantValue, - RegisterValueType.ConstantPointerValue, - ]: - continue - address = llil.dest.value.value - if address != func.start: +def extract_function_calls_from(fh: FunctionHandle): + """extract callers from a function""" + func: Function = fh.inner + + callee: int + for callee in fh.ctx["call_graph"]["calls_from"].get(func.start, []): + if callee == func.start: continue - yield Characteristic("calls to"), AbsoluteVirtualAddress(caller.address) + yield Characteristic("calls from"), AbsoluteVirtualAddress(callee) def extract_function_loop(fh: FunctionHandle): @@ -72,13 +58,12 @@ def extract_function_loop(fh: FunctionHandle): def extract_recursive_call(fh: FunctionHandle): """extract recursive function call""" func: Function = fh.inner - bv: BinaryView = func.view - if bv is None: - return - for ref in bv.get_code_refs(func.start): - if ref.function == func: + caller: int + for caller in fh.ctx["call_graph"]["calls_to"].get(func.start, []): + if caller == func.start: yield Characteristic("recursive call"), fh.address + return def extract_function_name(fh: FunctionHandle): @@ -108,4 +93,10 @@ def extract_features(fh: FunctionHandle) -> Iterator[tuple[Feature, Address]]: yield feature, addr -FUNCTION_HANDLERS = (extract_function_calls_to, extract_function_loop, extract_recursive_call, extract_function_name) +FUNCTION_HANDLERS = ( + extract_function_calls_to, + extract_function_calls_from, + extract_function_loop, + extract_recursive_call, + extract_function_name, +) diff --git a/capa/features/extractors/binja/helpers.py b/capa/features/extractors/binja/helpers.py index 27e8d29e0..ea60665be 100644 --- a/capa/features/extractors/binja/helpers.py +++ b/capa/features/extractors/binja/helpers.py @@ -9,7 +9,7 @@ from typing import Callable from dataclasses import dataclass -from binaryninja import BinaryView, LowLevelILInstruction +from binaryninja import BinaryView, LowLevelILOperation, LowLevelILInstruction from binaryninja.architecture import InstructionTextToken @@ -18,6 +18,24 @@ class DisassemblyInstruction: address: int length: int text: list[InstructionTextToken] + llil: list[LowLevelILInstruction] + + @property + def is_call(self): + if not self.llil: + return False + + # TODO(williballenthin): when to use one vs many llil instructions + # https://github.com/Vector35/binaryninja-api/issues/6205 + llil = self.llil[0] + if not llil: + return False + + return llil.operation in [ + LowLevelILOperation.LLIL_CALL, + LowLevelILOperation.LLIL_CALL_STACK_ADJUST, + LowLevelILOperation.LLIL_TAILCALL, + ] LLIL_VISITOR = Callable[[LowLevelILInstruction, LowLevelILInstruction, int], bool] diff --git a/capa/features/extractors/binja/insn.py b/capa/features/extractors/binja/insn.py index cae131ef9..a58239305 100644 --- a/capa/features/extractors/binja/insn.py +++ b/capa/features/extractors/binja/insn.py @@ -23,7 +23,7 @@ import capa.features.extractors.helpers from capa.features.insn import API, MAX_STRUCTURE_SIZE, Number, Offset, Mnemonic, OperandNumber, OperandOffset from capa.features.common import MAX_BYTES_FEATURE_SIZE, Bytes, String, Feature, Characteristic -from capa.features.address import Address, AbsoluteVirtualAddress +from capa.features.address import Address from capa.features.extractors.binja.helpers import DisassemblyInstruction, visit_llil_exprs from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle @@ -32,6 +32,7 @@ SECURITY_COOKIE_BYTES_DELTA = 0x40 +# TODO: move this to call graph pass # check if a function is a stub function to another function/symbol. The criteria is: # 1. The function must only have one basic block # 2. The function must only make one call/jump to another address @@ -82,8 +83,9 @@ def extract_insn_api_features(fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle) """ func: Function = fh.inner bv: BinaryView = func.view + insn: DisassemblyInstruction = ih.inner - for llil in func.get_llils_at(ih.address): + for llil in insn.llil: if llil.operation in [ LowLevelILOperation.LLIL_CALL, LowLevelILOperation.LLIL_CALL_STACK_ADJUST, @@ -138,10 +140,11 @@ def extract_insn_number_features( example: push 3136B0h ; dwControlCode """ - func: Function = fh.inner + insn: DisassemblyInstruction = ih.inner results: list[tuple[Any[Number, OperandNumber], Address]] = [] + # TODO: try to move this out of line def llil_checker(il: LowLevelILInstruction, parent: LowLevelILInstruction, index: int) -> bool: if il.operation == LowLevelILOperation.LLIL_LOAD: return False @@ -165,7 +168,7 @@ def llil_checker(il: LowLevelILInstruction, parent: LowLevelILInstruction, index return False - for llil in func.get_llils_at(ih.address): + for llil in insn.llil: visit_llil_exprs(llil, llil_checker) yield from results @@ -179,11 +182,11 @@ def extract_insn_bytes_features(fh: FunctionHandle, bbh: BBHandle, ih: InsnHandl """ func: Function = fh.inner bv: BinaryView = func.view + insn: DisassemblyInstruction = ih.inner candidate_addrs = set() - llil = func.get_llil_at(ih.address) - if llil is None or llil.operation in [LowLevelILOperation.LLIL_CALL, LowLevelILOperation.LLIL_CALL_STACK_ADJUST]: + if insn.is_call: return for ref in bv.get_code_refs_from(ih.address): @@ -205,7 +208,7 @@ def llil_checker(il: LowLevelILInstruction, parent: LowLevelILInstruction, index return True - for llil in func.get_llils_at(ih.address): + for llil in insn.llil: visit_llil_exprs(llil, llil_checker) for addr in candidate_addrs: @@ -227,6 +230,7 @@ def extract_insn_string_features( """ func: Function = fh.inner bv: BinaryView = func.view + insn: DisassemblyInstruction = ih.inner candidate_addrs = set() @@ -250,7 +254,7 @@ def llil_checker(il: LowLevelILInstruction, parent: LowLevelILInstruction, index return True - for llil in func.get_llils_at(ih.address): + for llil in insn.llil: visit_llil_exprs(llil, llil_checker) # Now we have all the candidate address, check them for string or pointer to string @@ -283,6 +287,7 @@ def extract_insn_offset_features( .text:0040112F cmp [esi+4], ebx """ func: Function = fh.inner + insn: DisassemblyInstruction = ih.inner results: list[tuple[Any[Offset, OperandOffset], Address]] = [] address_size = func.view.arch.address_size * 8 @@ -327,7 +332,7 @@ def llil_checker(il: LowLevelILInstruction, parent: LowLevelILInstruction, index return True - for llil in func.get_llils_at(ih.address): + for llil in insn.llil: visit_llil_exprs(llil, llil_checker) yield from results @@ -367,7 +372,7 @@ def extract_insn_nzxor_characteristic_features( parse instruction non-zeroing XOR instruction ignore expected non-zeroing XORs, e.g. security cookies """ - func: Function = fh.inner + insn: DisassemblyInstruction = ih.inner results = [] @@ -383,7 +388,7 @@ def llil_checker(il: LowLevelILInstruction, parent: LowLevelILInstruction, index else: return True - for llil in func.get_llils_at(ih.address): + for llil in insn.llil: visit_llil_exprs(llil, llil_checker) yield from results @@ -415,7 +420,7 @@ def extract_insn_peb_access_characteristic_features( fs:[0x30] on x86, gs:[0x60] on x64 """ - func: Function = fh.inner + insn: DisassemblyInstruction = ih.inner results = [] @@ -445,7 +450,7 @@ def llil_checker(il: LowLevelILInstruction, parent: LowLevelILOperation, index: results.append((Characteristic("peb access"), ih.address)) return False - for llil in func.get_llils_at(ih.address): + for llil in insn.llil: visit_llil_exprs(llil, llil_checker) yield from results @@ -455,7 +460,7 @@ def extract_insn_segment_access_features( fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle ) -> Iterator[tuple[Feature, Address]]: """parse instruction fs or gs access""" - func: Function = fh.inner + insn: DisassemblyInstruction = ih.inner results = [] @@ -472,7 +477,7 @@ def llil_checker(il: LowLevelILInstruction, parent: LowLevelILInstruction, index return True - for llil in func.get_llils_at(ih.address): + for llil in insn.llil: visit_llil_exprs(llil, llil_checker) yield from results @@ -500,47 +505,6 @@ def extract_insn_cross_section_cflow( yield Characteristic("cross section flow"), ih.address -def extract_function_calls_from(fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle) -> Iterator[tuple[Feature, Address]]: - """extract functions calls from features - - most relevant at the function scope, however, its most efficient to extract at the instruction scope - """ - func: Function = fh.inner - bv: BinaryView = func.view - - if bv is None: - return - - for il in func.get_llils_at(ih.address): - if il.operation not in [ - LowLevelILOperation.LLIL_CALL, - LowLevelILOperation.LLIL_CALL_STACK_ADJUST, - LowLevelILOperation.LLIL_TAILCALL, - ]: - continue - - dest = il.dest - if dest.operation == LowLevelILOperation.LLIL_CONST_PTR: - value = dest.value.value - yield Characteristic("calls from"), AbsoluteVirtualAddress(value) - elif dest.operation == LowLevelILOperation.LLIL_CONST: - yield Characteristic("calls from"), AbsoluteVirtualAddress(dest.value) - elif dest.operation == LowLevelILOperation.LLIL_LOAD: - indirect_src = dest.src - if indirect_src.operation == LowLevelILOperation.LLIL_CONST_PTR: - value = indirect_src.value.value - yield Characteristic("calls from"), AbsoluteVirtualAddress(value) - elif indirect_src.operation == LowLevelILOperation.LLIL_CONST: - yield Characteristic("calls from"), AbsoluteVirtualAddress(indirect_src.value) - elif dest.operation == LowLevelILOperation.LLIL_REG: - if dest.value.type in [ - RegisterValueType.ImportedAddressValue, - RegisterValueType.ConstantValue, - RegisterValueType.ConstantPointerValue, - ]: - yield Characteristic("calls from"), AbsoluteVirtualAddress(dest.value.value) - - def extract_function_indirect_call_characteristic_features( fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle ) -> Iterator[tuple[Feature, Address]]: @@ -550,14 +514,15 @@ def extract_function_indirect_call_characteristic_features( most relevant at the function or basic block scope; however, its most efficient to extract at the instruction scope """ - func: Function = fh.inner + insn: DisassemblyInstruction = ih.inner + + if not insn.is_call: + return - llil = func.get_llil_at(ih.address) - if llil is None or llil.operation not in [ - LowLevelILOperation.LLIL_CALL, - LowLevelILOperation.LLIL_CALL_STACK_ADJUST, - LowLevelILOperation.LLIL_TAILCALL, - ]: + # TODO(williballenthin): when to use one vs many llil instructions + # https://github.com/Vector35/binaryninja-api/issues/6205 + llil = insn.llil[0] + if not llil: return if llil.dest.operation in [LowLevelILOperation.LLIL_CONST, LowLevelILOperation.LLIL_CONST_PTR]: @@ -590,6 +555,5 @@ def extract_features(f: FunctionHandle, bbh: BBHandle, insn: InsnHandle) -> Iter extract_insn_peb_access_characteristic_features, extract_insn_cross_section_cflow, extract_insn_segment_access_features, - extract_function_calls_from, extract_function_indirect_call_characteristic_features, ) diff --git a/capa/loader.py b/capa/loader.py index 700d1a3ba..f8e82f259 100644 --- a/capa/loader.py +++ b/capa/loader.py @@ -239,6 +239,7 @@ def get_extractor( return capa.features.extractors.dnfile.extractor.DnfileFeatureExtractor(input_path) elif backend == BACKEND_BINJA: + import capa.perf as perf import capa.features.extractors.binja.find_binja_api as finder if not finder.has_binaryninja(): @@ -262,9 +263,10 @@ def get_extractor( raise UnsupportedOSError() with console.status("analyzing program...", spinner="dots"): - bv: binaryninja.BinaryView = binaryninja.load(str(input_path)) - if bv is None: - raise RuntimeError(f"Binary Ninja cannot open file {input_path}") + with perf.timing("binary ninja: loading program"): + bv: binaryninja.BinaryView = binaryninja.load(str(input_path)) + if bv is None: + raise RuntimeError(f"Binary Ninja cannot open file {input_path}") return capa.features.extractors.binja.extractor.BinjaFeatureExtractor(bv) diff --git a/capa/perf.py b/capa/perf.py index 38962222f..c22a095ff 100644 --- a/capa/perf.py +++ b/capa/perf.py @@ -5,6 +5,9 @@ # Unless required by applicable law or agreed to in writing, software distributed under the License # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and limitations under the License. +import time +import inspect +import contextlib import collections # this structure is unstable and may change before the next major release. @@ -14,3 +17,20 @@ def reset(): global counters counters = collections.Counter() + + +@contextlib.contextmanager +def timing(msg: str): + """log the given message start/stop and time taken, using the caller's `logger` instance.""" + # stack: + # 0: here + # 1: contextlib + # 2: caller + caller = inspect.stack()[2] + caller_logger = caller.frame.f_globals.get("logger") + + caller_logger.debug("%s...", msg) + t0 = time.time() + yield + t1 = time.time() + caller_logger.debug("%s done in %0.1fs.", msg, t1 - t0)