Skip to content

Commit

Permalink
binexport: init refactor for multi-arch instruction feature parsing
Browse files Browse the repository at this point in the history
  • Loading branch information
mike-hunhoff committed Aug 6, 2024
1 parent 210f127 commit 877134e
Show file tree
Hide file tree
Showing 9 changed files with 567 additions and 373 deletions.
Empty file.
Empty file.
131 changes: 131 additions & 0 deletions capa/features/extractors/binexport2/arch/arm/insn.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
# Copyright (C) 2024 Mandiant, Inc. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at: [package root]/LICENSE.txt
# Unless required by applicable law or agreed to in writing, software distributed under the License
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and limitations under the License.
import logging
from typing import List, Tuple, Iterator, Optional

import capa.features.extractors.binexport2.helpers
from capa.features.insn import Number, Offset, OperandNumber, OperandOffset
from capa.features.common import Feature, Characteristic
from capa.features.address import Address
from capa.features.extractors.binexport2 import FunctionContext, InstructionContext
from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle
from capa.features.extractors.binexport2.helpers import (
mask_immediate,
is_address_mapped,
get_operand_expressions,
get_operand_immediate_expression,
)
from capa.features.extractors.binexport2.binexport2_pb2 import BinExport2

logger = logging.getLogger(__name__)


def extract_insn_number_features(
fh: FunctionHandle, _bbh: BBHandle, ih: InsnHandle
) -> Iterator[Tuple[Feature, Address]]:
fhi: FunctionContext = fh.inner
ii: InstructionContext = ih.inner

be2: BinExport2 = fhi.ctx.be2

instruction_index: int = ii.instruction_index
instruction: BinExport2.Instruction = be2.instruction[instruction_index]

if len(instruction.operand_index) == 0:
# skip things like:
# .text:0040116e leave
return

for i, operand_index in enumerate(instruction.operand_index):
operand: BinExport2.Operand = be2.operand[operand_index]

immediate_expression: Optional[BinExport2.Expression] = get_operand_immediate_expression(be2, operand)
if not immediate_expression:
continue

value: int = mask_immediate(fhi.arch, immediate_expression.immediate)
if is_address_mapped(be2, value):
continue

yield Number(value), ih.address
yield OperandNumber(i, value), ih.address


def extract_insn_offset_features(
fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle
) -> Iterator[Tuple[Feature, Address]]:
fhi: FunctionContext = fh.inner
ii: InstructionContext = ih.inner

be2: BinExport2 = fhi.ctx.be2
instruction: BinExport2.Instruction = be2.instruction[ii.instruction_index]

if len(instruction.operand_index) == 0:
# skip things like:
# .text:0040116e leave
return

mnemonic: str = be2.mnemonic[instruction.mnemonic_index].name.lower()

for i, operand_index in enumerate(instruction.operand_index):
operand: BinExport2.Operand = be2.operand[operand_index]

is_dereference = False
for expression_index in operand.expression_index:
if be2.expression[expression_index].type == BinExport2.Expression.DEREFERENCE:
is_dereference = True
break

if not is_dereference:
continue

if mnemonic == "ldp":
# like:
# 0013a2f0 ldp x22,x9,[x21, #0x18]
expressions: List[BinExport2.Expression] = get_operand_expressions(be2, operand)
if len(expressions) <= 2:
continue

if expressions[1].symbol.lower().endswith("sp"):
continue

value = mask_immediate(fhi.arch, expressions[-1].immediate)

if not is_address_mapped(be2, value):
value = capa.features.extractors.binexport2.helpers.twos_complement(fhi.arch, value)

yield Offset(value), ih.address
yield OperandOffset(i, value), ih.address


def extract_insn_nzxor_characteristic_features(
fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle
) -> Iterator[Tuple[Feature, Address]]:
fhi: FunctionContext = fh.inner
ii: InstructionContext = ih.inner

be2: BinExport2 = fhi.ctx.be2

instruction: BinExport2.Instruction = be2.instruction[ii.instruction_index]
mnemonic: str = be2.mnemonic[instruction.mnemonic_index].name.lower()

if mnemonic != "eor":
return

operands: List[BinExport2.Operand] = [be2.operand[operand_index] for operand_index in instruction.operand_index]

assert len(operands) == 3

if operands[1] != operands[2]:
yield Characteristic("nzxor"), ih.address


def extract_function_indirect_call_characteristic_features(
fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle
) -> Iterator[Tuple[Feature, Address]]:
yield from ()
Empty file.
135 changes: 135 additions & 0 deletions capa/features/extractors/binexport2/arch/intel/helpers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
# Copyright (C) 2024 Mandiant, Inc. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at: [package root]/LICENSE.txt
# Unless required by applicable law or agreed to in writing, software distributed under the License
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and limitations under the License.
from typing import List, Optional
from dataclasses import dataclass

from capa.features.extractors.binexport2.helpers import get_operand_expressions
from capa.features.extractors.binexport2.binexport2_pb2 import BinExport2

# security cookie checks may perform non-zeroing XORs, these are expected within a certain
# byte range within the first and returning basic blocks, this helps to reduce FP features
SECURITY_COOKIE_BYTES_DELTA: int = 0x40


@dataclass
class OperandPhraseInfo:
scale: Optional[BinExport2.Expression] = None
index: Optional[BinExport2.Expression] = None
base: Optional[BinExport2.Expression] = None
displacement: Optional[BinExport2.Expression] = None


def get_operand_phrase_info(be2: BinExport2, operand: BinExport2.Operand) -> Optional[OperandPhraseInfo]:
# assume the following (see https://blog.yossarian.net/2020/06/13/How-x86_64-addresses-memory):
#
# Scale: A 2-bit constant factor
# Index: Any general purpose register
# Base: Any general purpose register
# Displacement: An integral offset

expressions: List[BinExport2.Expression] = get_operand_expressions(be2, operand)

# skip expression up to and including BinExport2.Expression.DEREFERENCE, assume caller
# has checked for BinExport2.Expression.DEREFERENCE
for i, expression in enumerate(expressions):
if expression.type == BinExport2.Expression.DEREFERENCE:
expressions = expressions[i + 1 :]
break

expression0: BinExport2.Expression
expression1: BinExport2.Expression
expression2: BinExport2.Expression
expression3: BinExport2.Expression
expression4: BinExport2.Expression

if len(expressions) == 1:
expression0 = expressions[0]

assert (
expression0.type == BinExport2.Expression.IMMEDIATE_INT
or expression0.type == BinExport2.Expression.REGISTER
)

if expression0.type == BinExport2.Expression.IMMEDIATE_INT:
# Displacement
return OperandPhraseInfo(displacement=expression0)
elif expression0.type == BinExport2.Expression.REGISTER:
# Base
return OperandPhraseInfo(base=expression0)

elif len(expressions) == 3:
expression0 = expressions[0]
expression1 = expressions[1]
expression2 = expressions[2]

assert expression0.type == BinExport2.Expression.REGISTER
assert expression1.type == BinExport2.Expression.OPERATOR
assert (
expression2.type == BinExport2.Expression.IMMEDIATE_INT
or expression2.type == BinExport2.Expression.REGISTER
)

if expression2.type == BinExport2.Expression.REGISTER:
# Base + Index
return OperandPhraseInfo(base=expression0, index=expression2)
elif expression2.type == BinExport2.Expression.IMMEDIATE_INT:
# Base + Displacement
return OperandPhraseInfo(base=expression0, displacement=expression2)

elif len(expressions) == 5:
expression0 = expressions[0]
expression1 = expressions[1]
expression2 = expressions[2]
expression3 = expressions[3]
expression4 = expressions[4]

assert expression0.type == BinExport2.Expression.REGISTER
assert expression1.type == BinExport2.Expression.OPERATOR
assert (
expression2.type == BinExport2.Expression.REGISTER
or expression2.type == BinExport2.Expression.IMMEDIATE_INT
)
assert expression3.type == BinExport2.Expression.OPERATOR
assert expression4.type == BinExport2.Expression.IMMEDIATE_INT

if expression1.symbol == "+" and expression3.symbol == "+":
# Base + Index + Displacement
return OperandPhraseInfo(base=expression0, index=expression2, displacement=expression4)
elif expression1.symbol == "+" and expression3.symbol == "*":
# Base + (Index * Scale)
return OperandPhraseInfo(base=expression0, index=expression2, scale=expression3)
elif expression1.symbol == "*" and expression3.symbol == "+":
# (Index * Scale) + Displacement
return OperandPhraseInfo(index=expression0, scale=expression2, displacement=expression3)
else:
raise NotImplementedError(expression1.symbol, expression3.symbol)

elif len(expressions) == 7:
expression0 = expressions[0]
expression1 = expressions[1]
expression2 = expressions[2]
expression3 = expressions[3]
expression4 = expressions[4]
expression5 = expressions[5]
expression6 = expressions[6]

assert expression0.type == BinExport2.Expression.REGISTER
assert expression1.type == BinExport2.Expression.OPERATOR
assert expression2.type == BinExport2.Expression.REGISTER
assert expression3.type == BinExport2.Expression.OPERATOR
assert expression4.type == BinExport2.Expression.IMMEDIATE_INT
assert expression5.type == BinExport2.Expression.OPERATOR
assert expression6.type == BinExport2.Expression.IMMEDIATE_INT

# Base + (Index * Scale) + Displacement
return OperandPhraseInfo(base=expression0, index=expression2, scale=expression4, displacement=expression6)

else:
raise NotImplementedError(len(expressions))

return None
Loading

0 comments on commit 877134e

Please sign in to comment.