From 877134e86c1a503dbe5982128e935ba2f6d158ef Mon Sep 17 00:00:00 2001 From: Mike Hunhoff Date: Tue, 6 Aug 2024 16:24:15 -0600 Subject: [PATCH] binexport: init refactor for multi-arch instruction feature parsing --- .../extractors/binexport2/arch/__init__.py | 0 .../binexport2/arch/arm/__init__.py | 0 .../extractors/binexport2/arch/arm/insn.py | 131 ++++++++++ .../binexport2/arch/intel/__init__.py | 0 .../binexport2/arch/intel/helpers.py | 135 ++++++++++ .../extractors/binexport2/arch/intel/insn.py | 214 ++++++++++++++++ .../features/extractors/binexport2/helpers.py | 146 +++-------- capa/features/extractors/binexport2/insn.py | 237 +++--------------- tests/test_binexport_features.py | 77 ++---- 9 files changed, 567 insertions(+), 373 deletions(-) create mode 100644 capa/features/extractors/binexport2/arch/__init__.py create mode 100644 capa/features/extractors/binexport2/arch/arm/__init__.py create mode 100644 capa/features/extractors/binexport2/arch/arm/insn.py create mode 100644 capa/features/extractors/binexport2/arch/intel/__init__.py create mode 100644 capa/features/extractors/binexport2/arch/intel/helpers.py create mode 100644 capa/features/extractors/binexport2/arch/intel/insn.py diff --git a/capa/features/extractors/binexport2/arch/__init__.py b/capa/features/extractors/binexport2/arch/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/capa/features/extractors/binexport2/arch/arm/__init__.py b/capa/features/extractors/binexport2/arch/arm/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/capa/features/extractors/binexport2/arch/arm/insn.py b/capa/features/extractors/binexport2/arch/arm/insn.py new file mode 100644 index 000000000..3fdc6fb9f --- /dev/null +++ b/capa/features/extractors/binexport2/arch/arm/insn.py @@ -0,0 +1,131 @@ +# Copyright (C) 2024 Mandiant, Inc. All Rights Reserved. 
+# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: [package root]/LICENSE.txt +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and limitations under the License. +import logging +from typing import List, Tuple, Iterator, Optional + +import capa.features.extractors.binexport2.helpers +from capa.features.insn import Number, Offset, OperandNumber, OperandOffset +from capa.features.common import Feature, Characteristic +from capa.features.address import Address +from capa.features.extractors.binexport2 import FunctionContext, InstructionContext +from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle +from capa.features.extractors.binexport2.helpers import ( + mask_immediate, + is_address_mapped, + get_operand_expressions, + get_operand_immediate_expression, +) +from capa.features.extractors.binexport2.binexport2_pb2 import BinExport2 + +logger = logging.getLogger(__name__) + + +def extract_insn_number_features( + fh: FunctionHandle, _bbh: BBHandle, ih: InsnHandle +) -> Iterator[Tuple[Feature, Address]]: + fhi: FunctionContext = fh.inner + ii: InstructionContext = ih.inner + + be2: BinExport2 = fhi.ctx.be2 + + instruction_index: int = ii.instruction_index + instruction: BinExport2.Instruction = be2.instruction[instruction_index] + + if len(instruction.operand_index) == 0: + # skip things like: + # .text:0040116e leave + return + + for i, operand_index in enumerate(instruction.operand_index): + operand: BinExport2.Operand = be2.operand[operand_index] + + immediate_expression: Optional[BinExport2.Expression] = get_operand_immediate_expression(be2, operand) + if not immediate_expression: 
+ continue + + value: int = mask_immediate(fhi.arch, immediate_expression.immediate) + if is_address_mapped(be2, value): + continue + + yield Number(value), ih.address + yield OperandNumber(i, value), ih.address + + +def extract_insn_offset_features( + fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle +) -> Iterator[Tuple[Feature, Address]]: + fhi: FunctionContext = fh.inner + ii: InstructionContext = ih.inner + + be2: BinExport2 = fhi.ctx.be2 + instruction: BinExport2.Instruction = be2.instruction[ii.instruction_index] + + if len(instruction.operand_index) == 0: + # skip things like: + # .text:0040116e leave + return + + mnemonic: str = be2.mnemonic[instruction.mnemonic_index].name.lower() + + for i, operand_index in enumerate(instruction.operand_index): + operand: BinExport2.Operand = be2.operand[operand_index] + + is_dereference = False + for expression_index in operand.expression_index: + if be2.expression[expression_index].type == BinExport2.Expression.DEREFERENCE: + is_dereference = True + break + + if not is_dereference: + continue + + if mnemonic == "ldp": + # like: + # 0013a2f0 ldp x22,x9,[x21, #0x18] + expressions: List[BinExport2.Expression] = get_operand_expressions(be2, operand) + if len(expressions) <= 2: + continue + + if expressions[1].symbol.lower().endswith("sp"): + continue + + value = mask_immediate(fhi.arch, expressions[-1].immediate) + + if not is_address_mapped(be2, value): + value = capa.features.extractors.binexport2.helpers.twos_complement(fhi.arch, value) + + yield Offset(value), ih.address + yield OperandOffset(i, value), ih.address + + +def extract_insn_nzxor_characteristic_features( + fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle +) -> Iterator[Tuple[Feature, Address]]: + fhi: FunctionContext = fh.inner + ii: InstructionContext = ih.inner + + be2: BinExport2 = fhi.ctx.be2 + + instruction: BinExport2.Instruction = be2.instruction[ii.instruction_index] + mnemonic: str = be2.mnemonic[instruction.mnemonic_index].name.lower() + + if 
mnemonic != "eor": + return + + operands: List[BinExport2.Operand] = [be2.operand[operand_index] for operand_index in instruction.operand_index] + + assert len(operands) == 3 + + if operands[1] != operands[2]: + yield Characteristic("nzxor"), ih.address + + +def extract_function_indirect_call_characteristic_features( + fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle +) -> Iterator[Tuple[Feature, Address]]: + yield from () diff --git a/capa/features/extractors/binexport2/arch/intel/__init__.py b/capa/features/extractors/binexport2/arch/intel/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/capa/features/extractors/binexport2/arch/intel/helpers.py b/capa/features/extractors/binexport2/arch/intel/helpers.py new file mode 100644 index 000000000..3696c0d93 --- /dev/null +++ b/capa/features/extractors/binexport2/arch/intel/helpers.py @@ -0,0 +1,135 @@ +# Copyright (C) 2024 Mandiant, Inc. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: [package root]/LICENSE.txt +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and limitations under the License. 
+from typing import List, Optional
+from dataclasses import dataclass
+
+from capa.features.extractors.binexport2.helpers import get_operand_expressions
+from capa.features.extractors.binexport2.binexport2_pb2 import BinExport2
+
+# security cookie checks may perform non-zeroing XORs, these are expected within a certain
+# byte range within the first and returning basic blocks, this helps to reduce FP features
+SECURITY_COOKIE_BYTES_DELTA: int = 0x40
+
+
+@dataclass
+class OperandPhraseInfo:
+    scale: Optional[BinExport2.Expression] = None
+    index: Optional[BinExport2.Expression] = None
+    base: Optional[BinExport2.Expression] = None
+    displacement: Optional[BinExport2.Expression] = None
+
+
+def get_operand_phrase_info(be2: BinExport2, operand: BinExport2.Operand) -> Optional[OperandPhraseInfo]:
+    # split a dereferenced operand into its scale/index/base/displacement parts; parts not present stay None
+    # (see https://blog.yossarian.net/2020/06/13/How-x86_64-addresses-memory), assuming:
+    # Scale: A 2-bit constant factor
+    # Index: Any general purpose register
+    # Base: Any general purpose register
+    # Displacement: An integral offset
+
+    expressions: List[BinExport2.Expression] = get_operand_expressions(be2, operand)
+
+    # skip expression up to and including BinExport2.Expression.DEREFERENCE, assume caller
+    # has checked for BinExport2.Expression.DEREFERENCE
+    for i, expression in enumerate(expressions):
+        if expression.type == BinExport2.Expression.DEREFERENCE:
+            expressions = expressions[i + 1 :]
+            break
+
+    expression0: BinExport2.Expression
+    expression1: BinExport2.Expression
+    expression2: BinExport2.Expression
+    expression3: BinExport2.Expression
+    expression4: BinExport2.Expression
+
+    if len(expressions) == 1:
+        expression0 = expressions[0]
+
+        assert (
+            expression0.type == BinExport2.Expression.IMMEDIATE_INT
+            or expression0.type == BinExport2.Expression.REGISTER
+        )
+
+        if expression0.type == BinExport2.Expression.IMMEDIATE_INT:
+            # Displacement
+            return OperandPhraseInfo(displacement=expression0)
+        elif expression0.type == BinExport2.Expression.REGISTER:
+            # Base
+            return OperandPhraseInfo(base=expression0)
+
+    elif len(expressions) == 3:
+        expression0 = expressions[0]
+        expression1 = expressions[1]
+        expression2 = expressions[2]
+
+        assert expression0.type == BinExport2.Expression.REGISTER
+        assert expression1.type == BinExport2.Expression.OPERATOR
+        assert (
+            expression2.type == BinExport2.Expression.IMMEDIATE_INT
+            or expression2.type == BinExport2.Expression.REGISTER
+        )
+
+        if expression2.type == BinExport2.Expression.REGISTER:
+            # Base + Index
+            return OperandPhraseInfo(base=expression0, index=expression2)
+        elif expression2.type == BinExport2.Expression.IMMEDIATE_INT:
+            # Base + Displacement
+            return OperandPhraseInfo(base=expression0, displacement=expression2)
+
+    elif len(expressions) == 5:
+        expression0 = expressions[0]
+        expression1 = expressions[1]
+        expression2 = expressions[2]
+        expression3 = expressions[3]
+        expression4 = expressions[4]
+
+        assert expression0.type == BinExport2.Expression.REGISTER
+        assert expression1.type == BinExport2.Expression.OPERATOR
+        assert (
+            expression2.type == BinExport2.Expression.REGISTER
+            or expression2.type == BinExport2.Expression.IMMEDIATE_INT
+        )
+        assert expression3.type == BinExport2.Expression.OPERATOR
+        assert expression4.type == BinExport2.Expression.IMMEDIATE_INT
+
+        if expression1.symbol == "+" and expression3.symbol == "+":
+            # Base + Index + Displacement
+            return OperandPhraseInfo(base=expression0, index=expression2, displacement=expression4)
+        elif expression1.symbol == "+" and expression3.symbol == "*":
+            # Base + (Index * Scale): expression3 is the "*" operator, the scale immediate is expression4
+            return OperandPhraseInfo(base=expression0, index=expression2, scale=expression4)
+        elif expression1.symbol == "*" and expression3.symbol == "+":
+            # (Index * Scale) + Displacement: expression3 is the "+" operator, the displacement is expression4
+            return OperandPhraseInfo(index=expression0, scale=expression2, displacement=expression4)
+        else:
+            raise NotImplementedError(expression1.symbol, expression3.symbol)
+
+    elif len(expressions) == 7:
+        expression0 = expressions[0]
+        expression1 = expressions[1]
+        expression2 = expressions[2]
+        expression3 = expressions[3]
+        expression4 = expressions[4]
+        expression5 = expressions[5]
+        expression6 = expressions[6]
+
+        assert expression0.type == BinExport2.Expression.REGISTER
+        assert expression1.type == BinExport2.Expression.OPERATOR
+        assert expression2.type == BinExport2.Expression.REGISTER
+        assert expression3.type == BinExport2.Expression.OPERATOR
+        assert expression4.type == BinExport2.Expression.IMMEDIATE_INT
+        assert expression5.type == BinExport2.Expression.OPERATOR
+        assert expression6.type == BinExport2.Expression.IMMEDIATE_INT
+
+        # Base + (Index * Scale) + Displacement
+        return OperandPhraseInfo(base=expression0, index=expression2, scale=expression4, displacement=expression6)
+
+    else:
+        raise NotImplementedError(len(expressions))
+
+    return None
diff --git a/capa/features/extractors/binexport2/arch/intel/insn.py b/capa/features/extractors/binexport2/arch/intel/insn.py
new file mode 100644
index 000000000..9836ef1f7
--- /dev/null
+++ b/capa/features/extractors/binexport2/arch/intel/insn.py
@@ -0,0 +1,214 @@
+# Copyright (C) 2024 Mandiant, Inc. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at: [package root]/LICENSE.txt
+# Unless required by applicable law or agreed to in writing, software distributed under the License
+# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and limitations under the License.
+import logging +from typing import List, Tuple, Iterator, Optional + +import capa.features.extractors.strings +import capa.features.extractors.binexport2.helpers +from capa.features.insn import MAX_STRUCTURE_SIZE, Number, Offset, OperandNumber, OperandOffset +from capa.features.common import Feature, Characteristic +from capa.features.address import Address +from capa.features.extractors.binexport2 import FunctionContext, BasicBlockContext, InstructionContext +from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle +from capa.features.extractors.binexport2.helpers import ( + mask_immediate, + is_address_mapped, + get_operand_register_expression, + get_operand_immediate_expression, +) +from capa.features.extractors.binexport2.binexport2_pb2 import BinExport2 +from capa.features.extractors.binexport2.arch.intel.helpers import ( + SECURITY_COOKIE_BYTES_DELTA, + OperandPhraseInfo, + get_operand_phrase_info, +) + +logger = logging.getLogger(__name__) + + +def extract_insn_number_features( + fh: FunctionHandle, _bbh: BBHandle, ih: InsnHandle +) -> Iterator[Tuple[Feature, Address]]: + fhi: FunctionContext = fh.inner + ii: InstructionContext = ih.inner + + be2: BinExport2 = fhi.ctx.be2 + + instruction_index: int = ii.instruction_index + instruction: BinExport2.Instruction = be2.instruction[instruction_index] + + if len(instruction.operand_index) == 0: + # skip things like: + # .text:0040116e leave + return + + mnemonic: str = be2.mnemonic[instruction.mnemonic_index].name.lower() + + if mnemonic.startswith("ret"): + # skip things like: + # .text:0042250E retn 8 + return + + if mnemonic.startswith(("add", "sub")): + register_expression: Optional[BinExport2.Expression] = get_operand_register_expression( + be2, be2.operand[instruction.operand_index[0]] + ) + if register_expression and register_expression.symbol.lower().endswith(("sp", "bp")): + # skip things like: + # 0x415bbc ADD ESP, 0xC + return + + for i, operand_index in 
enumerate(instruction.operand_index): + operand: BinExport2.Operand = be2.operand[operand_index] + + immediate_expression: Optional[BinExport2.Expression] = get_operand_immediate_expression(be2, operand) + if not immediate_expression: + continue + + value: int = mask_immediate(fhi.arch, immediate_expression.immediate) + if is_address_mapped(be2, value): + continue + + yield Number(value), ih.address + yield OperandNumber(i, value), ih.address + + if mnemonic.startswith("add"): + if 0 < value < MAX_STRUCTURE_SIZE: + yield Offset(value), ih.address + yield OperandOffset(i, value), ih.address + + +def extract_insn_offset_features( + fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle +) -> Iterator[Tuple[Feature, Address]]: + fhi: FunctionContext = fh.inner + ii: InstructionContext = ih.inner + + be2: BinExport2 = fhi.ctx.be2 + instruction: BinExport2.Instruction = be2.instruction[ii.instruction_index] + + if len(instruction.operand_index) == 0: + # skip things like: + # .text:0040116e leave + return + + mnemonic: str = be2.mnemonic[instruction.mnemonic_index].name.lower() + value: int + + for i, operand_index in enumerate(instruction.operand_index): + operand: BinExport2.Operand = be2.operand[operand_index] + + is_dereference = False + for expression_index in operand.expression_index: + if be2.expression[expression_index].type == BinExport2.Expression.DEREFERENCE: + is_dereference = True + break + + if not is_dereference: + continue + + phrase_info: Optional[OperandPhraseInfo] = get_operand_phrase_info(be2, operand) + if not phrase_info: + continue + + if phrase_info.displacement: + if phrase_info.base and phrase_info.base.symbol.lower().endswith(("bp", "sp")): + # skips things like: + # 00401068 MOV dword ptr [EBP + local_8],EAX + continue + + value = mask_immediate(fhi.arch, phrase_info.displacement.immediate) + if not is_address_mapped(be2, value): + value = capa.features.extractors.binexport2.helpers.twos_complement(fhi.arch, value, 32) + + yield Offset(value), 
ih.address + yield OperandOffset(i, value), ih.address + + if mnemonic == "lea" and i == 1: + if phrase_info.base and not any((phrase_info.scale, phrase_info.index)): + yield Number(value), ih.address + yield OperandNumber(i, value), ih.address + + elif phrase_info.base and not any((phrase_info.index, phrase_info.scale)): + # like: + # 00401062 MOVZX EAX,word ptr [EDI] + yield Offset(0), ih.address + yield OperandOffset(i, 0), ih.address + + +def is_security_cookie( + fhi: FunctionContext, + bbi: BasicBlockContext, + instruction: BinExport2.Instruction, +) -> bool: + """ + check if an instruction is related to security cookie checks. + """ + be2: BinExport2 = fhi.ctx.be2 + + # security cookie check should use SP or BP + op1: BinExport2.Operand = be2.operand[instruction.operand_index[1]] + op1_exprs: List[BinExport2.Expression] = [be2.expression[expr_i] for expr_i in op1.expression_index] + if all(expr.symbol.lower() not in ("bp", "esp", "ebp", "rbp", "rsp") for expr in op1_exprs): + return False + + # check_nzxor_security_cookie_delta + # if insn falls at the start of first entry block of the parent function. + flow_graph: BinExport2.FlowGraph = be2.flow_graph[fhi.flow_graph_index] + basic_block_index: int = bbi.basic_block_index + bb: BinExport2.BasicBlock = be2.basic_block[basic_block_index] + if flow_graph.entry_basic_block_index == basic_block_index: + first_addr: int = min((be2.instruction[ir.begin_index].address for ir in bb.instruction_index)) + if instruction.address < first_addr + SECURITY_COOKIE_BYTES_DELTA: + return True + # or insn falls at the end before return in a terminal basic block. 
+ if basic_block_index not in (e.source_basic_block_index for e in flow_graph.edge): + last_addr: int = max((be2.instruction[ir.end_index - 1].address for ir in bb.instruction_index)) + if instruction.address > last_addr - SECURITY_COOKIE_BYTES_DELTA: + return True + return False + + +def extract_insn_nzxor_characteristic_features( + fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle +) -> Iterator[Tuple[Feature, Address]]: + """ + parse non-zeroing XOR instruction from the given instruction. + ignore expected non-zeroing XORs, e.g. security cookies. + """ + fhi: FunctionContext = fh.inner + ii: InstructionContext = ih.inner + + be2: BinExport2 = fhi.ctx.be2 + + instruction: BinExport2.Instruction = be2.instruction[ii.instruction_index] + mnemonic: BinExport2.Mnemonic = be2.mnemonic[instruction.mnemonic_index] + mnemonic_name: str = mnemonic.name.lower() + if mnemonic_name not in ( + "xor", + "xorpd", + "xorps", + "pxor", + ): + return + + operands: List[BinExport2.Operand] = [be2.operand[operand_index] for operand_index in instruction.operand_index] + + if mnemonic_name in ("xor", "xorpd", "xorps", "pxor"): + if operands[0] == operands[1]: + return + if is_security_cookie(fhi, bbh.inner, instruction): + return + + yield Characteristic("nzxor"), ih.address + + +def extract_function_indirect_call_characteristic_features( + fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle +) -> Iterator[Tuple[Feature, Address]]: + yield from () diff --git a/capa/features/extractors/binexport2/helpers.py b/capa/features/extractors/binexport2/helpers.py index 28f95cac6..4e2ef07e0 100644 --- a/capa/features/extractors/binexport2/helpers.py +++ b/capa/features/extractors/binexport2/helpers.py @@ -5,133 +5,45 @@ # Unless required by applicable law or agreed to in writing, software distributed under the License # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and limitations under the License. -from typing import List, Optional -from dataclasses import dataclass +from typing import Set, List, Iterator, Optional +import capa.features.extractors.helpers +from capa.features.common import ARCH_I386, ARCH_AMD64, ARCH_AARCH64 from capa.features.extractors.binexport2.binexport2_pb2 import BinExport2 +HAS_ARCH32 = {ARCH_I386} +HAS_ARCH64 = {ARCH_AARCH64, ARCH_AMD64} -@dataclass -class OperandPhraseInfo: - scale: Optional[BinExport2.Expression] = None - index: Optional[BinExport2.Expression] = None - base: Optional[BinExport2.Expression] = None - displacement: Optional[BinExport2.Expression] = None +HAS_ARCH_INTEL = {ARCH_I386, ARCH_AMD64} +HAS_ARCH_ARM = {ARCH_AARCH64} -def is_vertex_type(vertex: BinExport2.CallGraph.Vertex, type_: BinExport2.CallGraph.Vertex.Type.ValueType) -> bool: - return vertex.HasField("type") and vertex.type == type_ +def mask_immediate(arch: Set[str], immediate: int) -> int: + if arch & HAS_ARCH64: + immediate &= 0xFFFFFFFFFFFFFFFF + elif arch & HAS_ARCH32: + immediate &= 0xFFFFFFFF + return immediate -def get_operand_phrase_info(be2: BinExport2, operand: BinExport2.Operand) -> Optional[OperandPhraseInfo]: - # assume the following (see https://blog.yossarian.net/2020/06/13/How-x86_64-addresses-memory): - # - # Scale: A 2-bit constant factor - # Index: Any general purpose register - # Base: Any general purpose register - # Displacement: An integral offset - - expressions: List[BinExport2.Expression] = get_operand_expressions(be2, operand) - - # skip expression up to and including BinExport2.Expression.DEREFERENCE, assume caller - # has checked for BinExport2.Expression.DEREFERENCE - for i, expression in enumerate(expressions): - if expression.type == BinExport2.Expression.DEREFERENCE: - expressions = expressions[i + 1 :] - break - - expression0: BinExport2.Expression - expression1: BinExport2.Expression - expression2: BinExport2.Expression 
- expression3: BinExport2.Expression - expression4: BinExport2.Expression - - if len(expressions) == 1: - expression0 = expressions[0] - - assert ( - expression0.type == BinExport2.Expression.IMMEDIATE_INT - or expression0.type == BinExport2.Expression.REGISTER - ) - - if expression0.type == BinExport2.Expression.IMMEDIATE_INT: - # Displacement - return OperandPhraseInfo(displacement=expression0) - elif expression0.type == BinExport2.Expression.REGISTER: - # Base - return OperandPhraseInfo(base=expression0) - - elif len(expressions) == 3: - expression0 = expressions[0] - expression1 = expressions[1] - expression2 = expressions[2] - - assert expression0.type == BinExport2.Expression.REGISTER - assert expression1.type == BinExport2.Expression.OPERATOR - assert ( - expression2.type == BinExport2.Expression.IMMEDIATE_INT - or expression2.type == BinExport2.Expression.REGISTER - ) - - if expression2.type == BinExport2.Expression.REGISTER: - # Base + Index - return OperandPhraseInfo(base=expression0, index=expression2) - elif expression2.type == BinExport2.Expression.IMMEDIATE_INT: - # Base + Displacement - return OperandPhraseInfo(base=expression0, displacement=expression2) - - elif len(expressions) == 5: - expression0 = expressions[0] - expression1 = expressions[1] - expression2 = expressions[2] - expression3 = expressions[3] - expression4 = expressions[4] - - assert expression0.type == BinExport2.Expression.REGISTER - assert expression1.type == BinExport2.Expression.OPERATOR - assert ( - expression2.type == BinExport2.Expression.REGISTER - or expression2.type == BinExport2.Expression.IMMEDIATE_INT - ) - assert expression3.type == BinExport2.Expression.OPERATOR - assert expression4.type == BinExport2.Expression.IMMEDIATE_INT - - if expression1.symbol == "+" and expression3.symbol == "+": - # Base + Index + Displacement - return OperandPhraseInfo(base=expression0, index=expression2, displacement=expression4) - elif expression1.symbol == "+" and expression3.symbol == 
"*": - # Base + (Index * Scale) - return OperandPhraseInfo(base=expression0, index=expression2, scale=expression3) - elif expression1.symbol == "*" and expression3.symbol == "+": - # (Index * Scale) + Displacement - return OperandPhraseInfo(index=expression0, scale=expression2, displacement=expression3) - else: - raise NotImplementedError(expression1.symbol, expression3.symbol) - - elif len(expressions) == 7: - expression0 = expressions[0] - expression1 = expressions[1] - expression2 = expressions[2] - expression3 = expressions[3] - expression4 = expressions[4] - expression5 = expressions[5] - expression6 = expressions[6] - - assert expression0.type == BinExport2.Expression.REGISTER - assert expression1.type == BinExport2.Expression.OPERATOR - assert expression2.type == BinExport2.Expression.REGISTER - assert expression3.type == BinExport2.Expression.OPERATOR - assert expression4.type == BinExport2.Expression.IMMEDIATE_INT - assert expression5.type == BinExport2.Expression.OPERATOR - assert expression6.type == BinExport2.Expression.IMMEDIATE_INT - - # Base + (Index * Scale) + Displacement - return OperandPhraseInfo(base=expression0, index=expression2, scale=expression4, displacement=expression6) +def twos_complement(arch: Set[str], immediate: int, default: Optional[int] = None) -> int: + if default is not None: + return capa.features.extractors.helpers.twos_complement(immediate, default) + elif arch & HAS_ARCH64: + return capa.features.extractors.helpers.twos_complement(immediate, 64) + elif arch & HAS_ARCH32: + return capa.features.extractors.helpers.twos_complement(immediate, 32) + return immediate - else: - raise NotImplementedError(len(expressions)) - return None +def is_address_mapped(be2: BinExport2, address: int) -> bool: + """return True if the given address is mapped""" + sections_with_perms: Iterator[BinExport2.Section] = filter(lambda s: s.flag_r or s.flag_w or s.flag_x, be2.section) + return any(section.address <= address < section.address + 
section.size for section in sections_with_perms) + + +def is_vertex_type(vertex: BinExport2.CallGraph.Vertex, type_: BinExport2.CallGraph.Vertex.Type.ValueType) -> bool: + return vertex.HasField("type") and vertex.type == type_ def _get_operand_expression_list( diff --git a/capa/features/extractors/binexport2/insn.py b/capa/features/extractors/binexport2/insn.py index 50488e146..eab2182f8 100644 --- a/capa/features/extractors/binexport2/insn.py +++ b/capa/features/extractors/binexport2/insn.py @@ -6,13 +6,15 @@ # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and limitations under the License. import logging -from typing import List, Tuple, Iterator, Optional +from typing import List, Tuple, Iterator import capa.features.extractors.helpers import capa.features.extractors.strings import capa.features.extractors.binexport2.helpers -from capa.features.insn import API, MAX_STRUCTURE_SIZE, Number, Offset, Mnemonic, OperandNumber, OperandOffset -from capa.features.common import ARCH_I386, ARCH_AMD64, ARCH_AARCH64, Bytes, String, Feature, Characteristic +import capa.features.extractors.binexport2.arch.arm.insn +import capa.features.extractors.binexport2.arch.intel.insn +from capa.features.insn import API, Mnemonic +from capa.features.common import Bytes, String, Feature, Characteristic from capa.features.address import Address, AbsoluteVirtualAddress from capa.features.extractors.binexport2 import ( AddressSpace, @@ -20,40 +22,15 @@ BinExport2Index, FunctionContext, ReadMemoryError, - BasicBlockContext, BinExport2Analysis, InstructionContext, ) from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle -from capa.features.extractors.binexport2.helpers import ( - OperandPhraseInfo, - get_operand_phrase_info, - get_operand_register_expression, - get_operand_immediate_expression, -) +from 
capa.features.extractors.binexport2.helpers import HAS_ARCH_ARM, HAS_ARCH_INTEL from capa.features.extractors.binexport2.binexport2_pb2 import BinExport2 logger = logging.getLogger(__name__) -# security cookie checks may perform non-zeroing XORs, these are expected within a certain -# byte range within the first and returning basic blocks, this helps to reduce FP features -SECURITY_COOKIE_BYTES_DELTA: int = 0x40 - - -HAS_ARCH32 = {ARCH_I386} -HAS_ARCH64 = {ARCH_AARCH64, ARCH_AMD64} - -HAS_ARCH_INTEL = {ARCH_I386, ARCH_AMD64} -HAS_ARCH_ARM = {ARCH_AARCH64} - - -def mask_immediate(fhi: FunctionContext, immediate: int) -> int: - if fhi.arch & HAS_ARCH64: - immediate &= 0xFFFFFFFFFFFFFFFF - elif fhi.arch & HAS_ARCH32: - immediate &= 0xFFFFFFFF - return immediate - def extract_insn_api_features(fh: FunctionHandle, _bbh: BBHandle, ih: InsnHandle) -> Iterator[Tuple[Feature, Address]]: fhi: FunctionContext = fh.inner @@ -103,65 +80,15 @@ def extract_insn_api_features(fh: FunctionHandle, _bbh: BBHandle, ih: InsnHandle """ -def is_address_mapped(be2: BinExport2, address: int) -> bool: - """return True if the given address is mapped""" - sections_with_perms: Iterator[BinExport2.Section] = filter(lambda s: s.flag_r or s.flag_w or s.flag_x, be2.section) - return any(section.address <= address < section.address + section.size for section in sections_with_perms) - - def extract_insn_number_features( - fh: FunctionHandle, _bbh: BBHandle, ih: InsnHandle + fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle ) -> Iterator[Tuple[Feature, Address]]: fhi: FunctionContext = fh.inner - ii: InstructionContext = ih.inner - - be2: BinExport2 = fhi.ctx.be2 - - instruction_index: int = ii.instruction_index - instruction: BinExport2.Instruction = be2.instruction[instruction_index] - - if len(instruction.operand_index) == 0: - # skip things like: - # .text:0040116e leave - return - - mnemonic: str = be2.mnemonic[instruction.mnemonic_index].name.lower() if fhi.arch & HAS_ARCH_INTEL: - # 
short-circut checks for intel architecture - if mnemonic.startswith("ret"): - # skip things like: - # .text:0042250E retn 8 - return - - if mnemonic.startswith(("add", "sub")): - register_expression: Optional[BinExport2.Expression] = get_operand_register_expression( - be2, be2.operand[instruction.operand_index[0]] - ) - if register_expression and register_expression.symbol.lower().endswith(("sp", "bp")): - # skip things like: - # 0x415bbc ADD ESP, 0xC - return - - for i, operand_index in enumerate(instruction.operand_index): - operand: BinExport2.Operand = be2.operand[operand_index] - - immediate_expression: Optional[BinExport2.Expression] = get_operand_immediate_expression(be2, operand) - if not immediate_expression: - continue - - value: int = mask_immediate(fhi, immediate_expression.immediate) - if is_address_mapped(be2, value): - continue - - yield Number(value), ih.address - yield OperandNumber(i, value), ih.address - - if fhi.arch & HAS_ARCH_INTEL: - if mnemonic.startswith("add"): - if 0 < value < MAX_STRUCTURE_SIZE: - yield Offset(value), ih.address - yield OperandOffset(i, value), ih.address + yield from capa.features.extractors.binexport2.arch.intel.insn.extract_insn_number_features(fh, bbh, ih) + elif fhi.arch & HAS_ARCH_ARM: + yield from capa.features.extractors.binexport2.arch.arm.insn.extract_insn_number_features(fh, bbh, ih) def extract_insn_bytes_features(fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle) -> Iterator[Tuple[Feature, Address]]: @@ -246,133 +173,26 @@ def extract_insn_offset_features( fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle ) -> Iterator[Tuple[Feature, Address]]: fhi: FunctionContext = fh.inner - ii: InstructionContext = ih.inner - - be2: BinExport2 = fhi.ctx.be2 - instruction: BinExport2.Instruction = be2.instruction[ii.instruction_index] - - if len(instruction.operand_index) == 0: - # skip things like: - # .text:0040116e leave - return - - mnemonic: str = be2.mnemonic[instruction.mnemonic_index].name.lower() - - for i, 
operand_index in enumerate(instruction.operand_index): - operand: BinExport2.Operand = be2.operand[operand_index] - - is_dereference = False - for expression_index in operand.expression_index: - if be2.expression[expression_index].type == BinExport2.Expression.DEREFERENCE: - is_dereference = True - break - - if not is_dereference: - continue - if fhi.arch & HAS_ARCH_INTEL: - phrase_info: Optional[OperandPhraseInfo] = get_operand_phrase_info(be2, operand) - if not phrase_info: - continue - - if phrase_info.displacement: - if phrase_info.base and phrase_info.base.symbol.lower().endswith(("bp", "sp")): - # skips things like: - # 00401068 MOV dword ptr [EBP + local_8],EAX - continue - - value: int = mask_immediate(fhi, phrase_info.displacement.immediate) - if not is_address_mapped(be2, value): - value = capa.features.extractors.helpers.twos_complement(value, 32) - - yield Offset(value), ih.address - yield OperandOffset(i, value), ih.address - - if mnemonic == "lea" and i == 1: - if phrase_info.base and not any((phrase_info.scale, phrase_info.index)): - yield Number(value), ih.address - yield OperandNumber(i, value), ih.address - - elif phrase_info.base and not any((phrase_info.index, phrase_info.scale)): - # like: - # 00401062 MOVZX EAX,word ptr [EDI] - yield Offset(0), ih.address - yield OperandOffset(i, 0), ih.address - - -def is_security_cookie( - fhi: FunctionContext, - bbi: BasicBlockContext, - instruction: BinExport2.Instruction, -) -> bool: - """ - check if an instruction is related to security cookie checks. 
- """ - be2: BinExport2 = fhi.ctx.be2 - - # security cookie check should use SP or BP - op1: BinExport2.Operand = be2.operand[instruction.operand_index[1]] - op1_exprs: List[BinExport2.Expression] = [be2.expression[expr_i] for expr_i in op1.expression_index] - if all(expr.symbol.lower() not in ("bp", "esp", "ebp", "rbp", "rsp") for expr in op1_exprs): - return False - - # check_nzxor_security_cookie_delta - # if insn falls at the start of first entry block of the parent function. - flow_graph: BinExport2.FlowGraph = be2.flow_graph[fhi.flow_graph_index] - basic_block_index: int = bbi.basic_block_index - bb: BinExport2.BasicBlock = be2.basic_block[basic_block_index] - if flow_graph.entry_basic_block_index == basic_block_index: - first_addr: int = min((be2.instruction[ir.begin_index].address for ir in bb.instruction_index)) - if instruction.address < first_addr + SECURITY_COOKIE_BYTES_DELTA: - return True - # or insn falls at the end before return in a terminal basic block. - if basic_block_index not in (e.source_basic_block_index for e in flow_graph.edge): - last_addr: int = max((be2.instruction[ir.end_index - 1].address for ir in bb.instruction_index)) - if instruction.address > last_addr - SECURITY_COOKIE_BYTES_DELTA: - return True - return False + if fhi.arch & HAS_ARCH_INTEL: + yield from capa.features.extractors.binexport2.arch.intel.insn.extract_insn_offset_features(fh, bbh, ih) + elif fhi.arch & HAS_ARCH_ARM: + yield from capa.features.extractors.binexport2.arch.arm.insn.extract_insn_offset_features(fh, bbh, ih) def extract_insn_nzxor_characteristic_features( fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle ) -> Iterator[Tuple[Feature, Address]]: - """ - parse non-zeroing XOR instruction from the given instruction. - ignore expected non-zeroing XORs, e.g. security cookies. 
- """ fhi: FunctionContext = fh.inner - ii: InstructionContext = ih.inner - be2: BinExport2 = fhi.ctx.be2 - - instruction: BinExport2.Instruction = be2.instruction[ii.instruction_index] - mnemonic: BinExport2.Mnemonic = be2.mnemonic[instruction.mnemonic_index] - mnemonic_name: str = mnemonic.name.lower() - if mnemonic_name not in ( - "xor", - "xorpd", - "xorps", - "pxor", # x86 / amd64 - "eor", # arm / aarch64 - ): - return - - operands: List[BinExport2.Operand] = [be2.operand[operand_index] for operand_index in instruction.operand_index] - - # check whether operands are same for x86 / amd64 - if mnemonic_name in ("xor", "xorpd", "xorps", "pxor"): - if operands[0] == operands[1]: - return - if is_security_cookie(fhi, bbh.inner, instruction): - return - - # check whether 2nd/3rd operands are same for arm / aarch64 - if mnemonic_name == "eor": - assert len(operands) == 3 - if operands[1] == operands[2]: - return - - yield Characteristic("nzxor"), ih.address + if fhi.arch & HAS_ARCH_INTEL: + yield from capa.features.extractors.binexport2.arch.intel.insn.extract_insn_nzxor_characteristic_features( + fh, bbh, ih + ) + elif fhi.arch & HAS_ARCH_ARM: + yield from capa.features.extractors.binexport2.arch.arm.insn.extract_insn_nzxor_characteristic_features( + fh, bbh, ih + ) def extract_insn_mnemonic_features( @@ -412,9 +232,16 @@ def extract_function_calls_from(fh: FunctionHandle, bbh: BBHandle, ih: InsnHandl def extract_function_indirect_call_characteristic_features( fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle ) -> Iterator[Tuple[Feature, Address]]: - # TODO(wb): complete - # https://github.com/mandiant/capa/issues/1755 - yield from () + fhi: FunctionContext = fh.inner + + if fhi.arch & HAS_ARCH_INTEL: + yield from capa.features.extractors.binexport2.arch.intel.insn.extract_function_indirect_call_characteristic_features( + fh, bbh, ih + ) + elif fhi.arch & HAS_ARCH_ARM: + yield from 
capa.features.extractors.binexport2.arch.arm.insn.extract_function_indirect_call_characteristic_features( + fh, bbh, ih + ) def extract_features(f: FunctionHandle, bbh: BBHandle, insn: InsnHandle) -> Iterator[Tuple[Feature, Address]]: diff --git a/tests/test_binexport_features.py b/tests/test_binexport_features.py index 1cf8686f4..872a89d11 100644 --- a/tests/test_binexport_features.py +++ b/tests/test_binexport_features.py @@ -46,7 +46,7 @@ "687e79.ghidra.be2", "file", capa.features.file.Export("android::clearDir"), - "xfail: not implemented yet?!", + "xfail: name demangling is not implemented", ), ("687e79.ghidra.be2", "file", capa.features.file.Export("nope"), False), # file/imports @@ -104,19 +104,6 @@ capa.features.common.Characteristic("stack string"), "xfail: not implemented yet", ), - # bb/characteristic(tight loop) - ( - "687e79.ghidra.be2", - "function=0x0,bb=0x0", - capa.features.common.Characteristic("tight loop"), - "xfail: not implemented yet", - ), - ( - "687e79.ghidra.be2", - "function=0x0,bb=0x0", - capa.features.common.Characteristic("tight loop"), - "xfail: not implemented yet", - ), # insn/mnemonic ("687e79.ghidra.be2", "function=0x107588", capa.features.insn.Mnemonic("stp"), True), ("687e79.ghidra.be2", "function=0x107588", capa.features.insn.Mnemonic("adrp"), True), @@ -142,24 +129,24 @@ capa.features.insn.OperandNumber(1, 0x8), True, ), - ( - "687e79.ghidra.be2", - "function=0x107588,bb=0x107588,insn=0x1075b8", - capa.features.insn.OperandNumber(3, 0x10), - "xfail: GSM?", - ), # TODO(mr): https://github.com/mandiant/capa/issues/2102 # insn/operand.offset ( "687e79.ghidra.be2", - "function=0x0,bb=0x0", - capa.features.insn.OperandOffset(1, 100), - "xfail: not implemented yet", + "function=0x105128,bb=0x105450", + capa.features.insn.OperandOffset(2, 0x10), + True, ), ( - "687e79.ghidra.be2", - "function=0x0,bb=0x0", - capa.features.insn.OperandOffset(3, 100), - "xfail: not implemented yet", + "d1e650.ghidra.be2", + 
"function=0x124854,bb=0x1248AC,insn=0x1248B4", + capa.features.insn.OperandOffset(2, -0x48), + True, + ), + ( + "d1e650.ghidra.be2", + "function=0x13347c,bb=0x133548,insn=0x133554", + capa.features.insn.OperandOffset(2, 0x20), + False, ), # insn/number ("687e79.ghidra.be2", "function=0x107588", capa.features.insn.Number(0x3), True), @@ -181,7 +168,7 @@ "687e79.ghidra.be2", "function=0x1057f8,bb=0x1057f8", capa.features.insn.Number(0xFFFFFFFFFFFFFFFF), - "xfail: not implemented yet", + True, ), ( "687e79.ghidra.be2", @@ -192,34 +179,22 @@ # insn/offset ( "687e79.ghidra.be2", - "function=0x0", - capa.features.insn.Offset(0x0), - "xfail: not implemented yet", - ), - ( - "687e79.ghidra.be2", - "function=0x0", - capa.features.insn.Offset(0x4), - "xfail: not implemented yet", + "function=0x105128,bb=0x105450", + capa.features.insn.Offset(0x10), + True, ), ( - "687e79.ghidra.be2", - "function=0x0", - capa.features.insn.Offset(0xC), - "xfail: not implemented yet", + "d1e650.ghidra.be2", + "function=0x13347c,bb=0x133548,insn=0x133554", + capa.features.insn.Offset(0x20), + False, ), # insn/offset: negative ( - "687e79.ghidra.be2", - "function=0x0", - capa.features.insn.Offset(-0x1), - "xfail: not implemented yet", - ), - ( - "687e79.ghidra.be2", - "function=0x0", - capa.features.insn.Offset(-0x2), - "xfail: not implemented yet", + "d1e650.ghidra.be2", + "function=0x124854,bb=0x1248AC,insn=0x1248B4", + capa.features.insn.Offset(-0x48), + True, ), # insn/offset from mnemonic: add #