Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ghidra: improve instruction string and bytes feature extraction #1885

Merged
merged 4 commits into from
Dec 25, 2023
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions capa/features/extractors/ghidra/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -275,3 +275,27 @@ def dereference_ptr(insn: ghidra.program.database.code.InstructionDB):
return addr
else:
return to_deref


def find_data_references_from_insn(insn, max_depth: int = 10):
"""yield data references from given instruction"""
for reference in insn.getReferencesFrom():
if not reference.getReferenceType().isData():
# only care about data references
continue

to_addr = reference.getToAddress()

for _ in range(max_depth - 1):
data = getDataAt(to_addr) # type: ignore [name-defined] # noqa: F821
if data and data.isPointer():
ptr_value = data.getValue()

if ptr_value is None:
break

to_addr = ptr_value
else:
break

yield to_addr
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is the yield only per reference intended here?

where we expect multiple references from an instruction?

Copy link
Collaborator Author

@mike-hunhoff mike-hunhoff Nov 30, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is the yield only per reference intended here?

Yes - the intention is to follow nested pointers until data (as defined by Ghidra) is reached or max_depth is reached resulting in a single yield per memory reference from an instruction.

where we expect multiple references from an instruction?

Yes - an instruction may have multiple references from it. Here we restrict our search to memory references that reference data (as defined by Ghidra).

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok, I think the IDA equivalent works differently or am I confusing the functions?

What's an example of an instruction with multiple data references?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

perhaps symbolic analysis might resolve a set of candidate references for an indirect read/write like mov [eax], 1?

also perhaps ghidra supports an architecture with a complex memory to memory copy? (just guessing).

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For example,

MOV dword ptr [EBP + -0x8],0x469be6
	op[0]: RefType.WRITE
	op[1]: RefType.DATA

Ghidra tracks many different reference types so to be complete we must check them all for each instruction. This level of reference tracking can be really useful.

51 changes: 14 additions & 37 deletions capa/features/extractors/ghidra/insn.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,9 @@
SECURITY_COOKIE_BYTES_DELTA = 0x40


OPERAND_TYPE_DYNAMIC_ADDRESS = OperandType.DYNAMIC | OperandType.ADDRESS


def get_imports(ctx: Dict[str, Any]) -> Dict[int, Any]:
"""Populate the import cache for this context"""
if "imports_cache" not in ctx:
Expand Down Expand Up @@ -82,7 +85,7 @@ def check_for_api_call(
if not capa.features.extractors.ghidra.helpers.check_addr_for_api(addr_ref, fakes, imports, externs):
return
ref = addr_ref.getOffset()
elif ref_type == OperandType.DYNAMIC | OperandType.ADDRESS or ref_type == OperandType.DYNAMIC:
elif ref_type == OPERAND_TYPE_DYNAMIC_ADDRESS or ref_type == OperandType.DYNAMIC:
return # cannot resolve dynamics statically
else:
# pure address does not need to get dereferenced/ handled
Expand Down Expand Up @@ -214,27 +217,15 @@ def extract_insn_offset_features(fh: FunctionHandle, bb: BBHandle, ih: InsnHandl
def extract_insn_bytes_features(fh: FunctionHandle, bb: BBHandle, ih: InsnHandle) -> Iterator[Tuple[Feature, Address]]:
"""
parse referenced byte sequences

example:
push offset iid_004118d4_IShellLinkA ; riid
"""
insn: ghidra.program.database.code.InstructionDB = ih.inner

if capa.features.extractors.ghidra.helpers.is_call_or_jmp(insn):
return

ref = insn.getAddress() # init to insn addr
for i in range(insn.getNumOperands()):
if OperandType.isAddress(insn.getOperandType(i)):
ref = insn.getAddress(i) # pulls pointer if there is one

if ref != insn.getAddress(): # bail out if there's no pointer
ghidra_dat = getDataAt(ref) # type: ignore [name-defined] # noqa: F821
if (
ghidra_dat and not ghidra_dat.hasStringValue() and not ghidra_dat.isPointer()
): # avoid if the data itself is a pointer
extracted_bytes = capa.features.extractors.ghidra.helpers.get_bytes(ref, MAX_BYTES_FEATURE_SIZE)
for addr in capa.features.extractors.ghidra.helpers.find_data_references_from_insn(ih.inner):
data = getDataAt(addr) # type: ignore [name-defined] # noqa: F821
if data and not data.hasStringValue():
extracted_bytes = capa.features.extractors.ghidra.helpers.get_bytes(addr, MAX_BYTES_FEATURE_SIZE)
if extracted_bytes and not capa.features.extractors.helpers.all_zeros(extracted_bytes):
# don't extract byte features for obvious strings
yield Bytes(extracted_bytes), ih.address


Expand All @@ -245,24 +236,10 @@ def extract_insn_string_features(fh: FunctionHandle, bb: BBHandle, ih: InsnHandl
example:
push offset aAcr ; "ACR > "
"""
insn: ghidra.program.database.code.InstructionDB = ih.inner
dyn_addr = OperandType.DYNAMIC | OperandType.ADDRESS

ref = insn.getAddress()
for i in range(insn.getNumOperands()):
if OperandType.isScalarAsAddress(insn.getOperandType(i)):
ref = insn.getAddress(i)
# strings are also referenced dynamically via pointers & arrays, so we need to deref them
if insn.getOperandType(i) == dyn_addr:
ref = insn.getAddress(i)
dat = getDataAt(ref) # type: ignore [name-defined] # noqa: F821
if dat and dat.isPointer():
ref = dat.getValue()

if ref != insn.getAddress():
ghidra_dat = getDataAt(ref) # type: ignore [name-defined] # noqa: F821
colton-gabertan marked this conversation as resolved.
Show resolved Hide resolved
if ghidra_dat and ghidra_dat.hasStringValue():
yield String(ghidra_dat.getValue()), ih.address
for addr in capa.features.extractors.ghidra.helpers.find_data_references_from_insn(ih.inner):
data = getDataAt(addr) # type: ignore [name-defined] # noqa: F821
if data and data.hasStringValue():
yield String(data.getValue()), ih.address


def extract_insn_mnemonic_features(
Expand Down Expand Up @@ -359,7 +336,7 @@ def extract_insn_cross_section_cflow(
ref = capa.features.extractors.ghidra.helpers.dereference_ptr(insn)
if capa.features.extractors.ghidra.helpers.check_addr_for_api(ref, fakes, imports, externs):
return
elif ref_type == OperandType.DYNAMIC | OperandType.ADDRESS or ref_type == OperandType.DYNAMIC:
elif ref_type == OPERAND_TYPE_DYNAMIC_ADDRESS or ref_type == OperandType.DYNAMIC:
return # cannot resolve dynamics statically
else:
# pure address does not need to get dereferenced/ handled
Expand Down
Loading