Adding initial setup for function_extractor from binary files (for use in graph networking later on)
This commit is contained in:
parent
ce2384cb72
commit
3680b94d17
24
.vscode/launch.json
vendored
Normal file
24
.vscode/launch.json
vendored
Normal file
@ -0,0 +1,24 @@
|
||||
{
|
||||
// Use IntelliSense to learn about possible attributes.
|
||||
// Hover to view descriptions of existing attributes.
|
||||
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
|
||||
"version": "0.2.0",
|
||||
"configurations": [
|
||||
{
|
||||
"name": "FF_FunctionExtractor",
|
||||
"type": "debugpy",
|
||||
"request": "launch",
|
||||
"program": "herrewebpy/firmware_forensics/function_extractor.py",
|
||||
"args": [
|
||||
"--binary",
|
||||
"sample_data/firmwares/S7_BL31.bin",
|
||||
"--architecture",
|
||||
"ARM_AARCH64",
|
||||
"--endian",
|
||||
"little",
|
||||
],
|
||||
"console": "integratedTerminal",
|
||||
"justMyCode": false
|
||||
}
|
||||
]
|
||||
}
|
7
debug.py
7
debug.py
@ -1,2 +1,5 @@
|
||||
from herrewebpy.bioinformatics import sequence_alignment
|
||||
sequence_alignment.SequenceAlignment(['aa', 'bb', 'cc'],['bb','aa','cc'], ['1','2','3'], ['1','2','3'])
|
||||
#from herrewebpy.bioinformatics import sequence_alignment
|
||||
#sequence_alignment.SequenceAlignment(['aa', 'bb', 'cc'],['bb','aa','cc'], ['1','2','3'], ['1','2','3'])
|
||||
|
||||
from herrewebpy.firmware_forensics import function_extractor
|
||||
function_extractor.FunctionExtractor('', 'ARM_AARCH64')
|
1
herrewebpy/firmware_forensics/__init__.py
Normal file
1
herrewebpy/firmware_forensics/__init__.py
Normal file
@ -0,0 +1 @@
|
||||
from . import *
|
160
herrewebpy/firmware_forensics/function_extractor.py
Normal file
160
herrewebpy/firmware_forensics/function_extractor.py
Normal file
@ -0,0 +1,160 @@
|
||||
import argparse
|
||||
import sys
|
||||
from capstone import *
|
||||
from capstone.arm64 import ARM64_OP_IMM
|
||||
from tqdm import tqdm
|
||||
import pandas as pd
|
||||
from herrewebpy import logger
|
||||
|
||||
class ARM_AARCH32_Parser:
    """Placeholder parser for 32-bit ARM binaries.

    Function extraction is not implemented for this architecture yet; the
    class only stores the raw bytes it is given.
    """

    def __init__(self, binary_bytes):
        """Store the raw image bytes and start with an empty function list."""
        self.functions = []
        self.binary_bytes = binary_bytes

    def extract_functions(self):
        """Return the ``NotImplemented`` sentinel; nothing is extracted yet.

        Note that the sentinel is *returned*, not raised, so callers receive
        it as an ordinary value.
        """
        result = NotImplemented
        return result
|
||||
|
||||
|
||||
class ARM_AARCH64_Parser:
    """Heuristic function finder for a raw AArch64 code image.

    The image is treated as a flat sequence of 4-byte instructions whose
    first byte lives at ``start_address``.
    """

    def __init__(self, binary_bytes, start_address=0x0):
        """Store the image.

        Args:
            binary_bytes: Raw bytes of the image to scan.
            start_address: Address assigned to the first byte of the image.
        """
        self.binary_bytes = binary_bytes
        self.start_address = start_address
        self.functions = []

    def extract_functions(self, df, endian):
        """Linearly scan the image and record every function-like region.

        A region starts at the first ``stp``/``sub`` instruction that mentions
        ``sp`` while no region is open, and ends at the next ``ret``. Every
        ``bl`` with an immediate target is recorded as a reference of the most
        recently started region; calls seen before the first region are
        ignored.

        Args:
            df: DataFrame the new rows are appended to.
            endian: ``'big'`` selects big-endian decoding; any other value
                decodes as little-endian.

        Returns:
            ``df`` with one appended row per detected function, holding
            ``FUNC_ADDRESS`` (``func_<hex address>``), ``LOCATION`` (address)
            and ``REFERENCES`` (list of ``func_<hex target>`` strings).
        """
        mode = CS_MODE_ARM
        if endian == 'big':
            mode |= CS_MODE_BIG_ENDIAN
        md = Cs(CS_ARCH_ARM64, mode)
        # Operand details are required to read the immediate target of `bl`.
        md.detail = True

        rows = []
        current_refs = None  # REFERENCES list of the most recent function
        in_function = False

        # Stop before a trailing partial word; every step decodes one
        # instruction on its own so an undecodable word does not end the scan.
        for offset in tqdm(range(0, len(self.binary_bytes) - 3, 4), desc="Scanning binary for functions"):
            address = self.start_address + offset
            try:
                instruction = next(md.disasm(self.binary_bytes[offset:offset + 4], address), None)
            except CsError:
                continue
            if instruction is None:
                continue

            mnemonic = instruction.mnemonic

            if not in_function and mnemonic in ('stp', 'sub') and 'sp' in instruction.op_str:
                in_function = True
                current_refs = []
                rows.append({
                    'FUNC_ADDRESS': f"func_{address:x}",
                    'LOCATION': address,
                    'REFERENCES': current_refs,
                })

            if mnemonic == 'ret':
                # A plain AArch64 `ret` has an empty operand string, so the
                # mnemonic alone marks the end of the function.
                in_function = False
            elif mnemonic == 'bl' and current_refs is not None:
                operands = instruction.operands
                if operands and operands[0].type == ARM64_OP_IMM:
                    current_refs.append(f"func_{operands[0].imm:x}")

        if not rows:
            return df
        # One concat for all rows instead of one per detected function.
        return pd.concat([df, pd.DataFrame(rows)], ignore_index=True)

    def process_function(self, start_address, df, md, seen_functions, call_stack):
        """Disassemble one function starting at ``start_address``.

        Args:
            start_address: Address of the function's first instruction.
            df: DataFrame the function row is appended to.
            md: Disassembler with ``detail`` enabled, used to decode the image.
            seen_functions: Set of addresses already handled; updated in place.
            call_stack: List that receives not-yet-seen ``bl`` targets.

        Returns:
            ``df`` with a row appended when a ``ret`` is reached; otherwise
            ``df`` unchanged (already seen, address outside the image, or
            decoding stopped before a ``ret``).
        """
        if start_address in seen_functions:
            return df
        seen_functions.add(start_address)

        # Translate the address into an index into the image; a negative
        # index would otherwise silently slice from the end of the bytes.
        byte_offset = start_address - self.start_address
        if not 0 <= byte_offset < len(self.binary_bytes):
            return df

        references = []
        for instruction in md.disasm(self.binary_bytes[byte_offset:], start_address):
            if instruction.mnemonic == 'ret':
                new_row = pd.DataFrame([{
                    'FUNC_ADDRESS': f"func_{start_address:x}",
                    'LOCATION': start_address,
                    'REFERENCES': references,
                }])
                df = pd.concat([df, new_row], ignore_index=True)
                break

            if instruction.mnemonic == 'bl':
                operands = instruction.operands
                if operands and operands[0].type == ARM64_OP_IMM:
                    target_address = operands[0].imm
                    references.append(f"func_{target_address:x}")
                    if target_address not in seen_functions:
                        call_stack.append(target_address)

        return df
|
||||
|
||||
class FunctionExtractor:
    """Read a binary file and extract its functions on construction.

    Attributes:
        binary: Path of the binary file.
        architecture: Upper-cased architecture name.
        endian: Lower-cased endianness name.
        parser: Architecture-specific parser, or ``None`` when the
            architecture is not supported.
        functions: Result of the ARM_AARCH32 parser, otherwise ``None``.
        dataframe: DataFrame with the columns ``FUNC_ADDRESS``, ``LOCATION``
            and ``REFERENCES``.
    """

    def __init__(self, binary, architecture, endian='little'):
        """Store the settings and run the extraction immediately.

        Args:
            binary: Path of the binary file to read.
            architecture: ``'ARM_AARCH64'`` or ``'ARM_AARCH32'``
                (case-insensitive).
            endian: ``'little'`` (default) or ``'big'`` (case-insensitive).
        """
        self.binary = binary
        self.architecture = architecture.upper()
        # Normalised like `architecture`; the parser compares against 'big'.
        self.endian = endian.lower()
        # Defined up front so both attributes exist on every code path.
        self.parser = None
        self.functions = None
        self.dataframe = pd.DataFrame(columns=['FUNC_ADDRESS', 'LOCATION', 'REFERENCES'])
        self.extract_functions()

    def binary_bytes(self):
        """Return the full content of the binary file as bytes."""
        with open(self.binary, 'rb') as f:
            return f.read()

    def extract_functions(self):
        """Run the parser matching ``self.architecture``.

        ARM_AARCH64 results are stored in ``self.dataframe``; ARM_AARCH32
        results are stored in ``self.functions``. An unsupported architecture
        is logged as an error and leaves both untouched.
        """
        binary_bytes = self.binary_bytes()

        if self.architecture == 'ARM_AARCH32':
            self.parser = ARM_AARCH32_Parser(binary_bytes)
            self.functions = self.parser.extract_functions()
        elif self.architecture == 'ARM_AARCH64':
            self.parser = ARM_AARCH64_Parser(binary_bytes)
            self.dataframe = self.parser.extract_functions(self.dataframe, self.endian)
        else:
            logger.error('Architecture not supported: ' + self.architecture)
            return
|
||||
|
||||
def main(argv=None):
    """Command-line entry point: extract functions and write them as CSV.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read the
            process arguments.
    """
    parser = argparse.ArgumentParser(description='Extract functions from firmware binaries.')
    parser.add_argument('--binary', help='The binary file to extract functions from.', required=True, type=str)
    parser.add_argument('--output', help='The output file to write the functions to.', required=False, type=str)
    parser.add_argument(
        '--architecture',
        help='The architecture of the binary. Use: ARM_AARCH64, ARM_AARCH32, etc.',
        required=True,
        type=str,
        choices=['ARM_AARCH64', 'ARM_AARCH32']
    )
    parser.add_argument(
        '--endian',
        help='The endianness of the binary. Use: little, big.',
        required=True,
        type=str,
        choices=['little', 'big']
    )
    args = parser.parse_args(argv)

    # Build one message string: a second positional argument would be taken
    # as a %-format argument by stdlib-style loggers.
    logger.info(
        'Extracting functions from binary: ' + args.binary
        + ' using architecture: ' + args.architecture.upper()
    )

    extractor = FunctionExtractor(args.binary, args.architecture, args.endian)

    # Default output path sits next to the input binary.
    output_path = args.output or args.binary + '.functions'

    # Passing the path lets pandas open the file itself with the newline
    # handling it expects.
    extractor.dataframe.to_csv(output_path, index=False)


if __name__ == "__main__":
    main()
|
||||
|
||||
sys.path.append('.')
|
@ -1,6 +1,10 @@
|
||||
pandas
|
||||
numpy
|
||||
tensorflow
|
||||
sklearn
|
||||
scikit-learn
|
||||
seaborn
|
||||
scikit-learn
|
||||
scikit-learn
|
||||
capstone
|
||||
keystone-engine
|
||||
plotly
|
||||
BioPython
|
BIN
sample_data/firmwares/S7_BL31.bin
Normal file
BIN
sample_data/firmwares/S7_BL31.bin
Normal file
Binary file not shown.
Loading…
Reference in New Issue
Block a user