HerrewebPy/herrewebpy/firmware_forensics/function_extractor.py

161 lines
5.8 KiB
Python

import argparse
import sys
from capstone import *
from capstone.arm64 import ARM64_OP_IMM
from tqdm import tqdm
import pandas as pd
from herrewebpy import logger
class ARM_AARCH32_Parser:
    """Placeholder parser for 32-bit ARM (AArch32) binaries.

    The class only stores the raw image; function extraction has not been
    written yet.
    """

    def __init__(self, binary_bytes):
        """Keep the raw image and start with an empty list of functions."""
        self.functions = []
        self.binary_bytes = binary_bytes

    def extract_functions(self):
        """Return the ``NotImplemented`` sentinel (extraction is not implemented yet)."""
        not_supported = NotImplemented
        return not_supported
class ARM_AARCH64_Parser:
    """Heuristic function finder for raw AArch64 (ARM64) images.

    The image is decoded one 4-byte word at a time. A function is assumed to
    start at an ``stp``/``sub`` instruction that mentions ``sp`` and to end at
    the next return instruction. Direct ``bl`` calls are recorded as
    references of the most recently discovered function.
    """

    # Mnemonics that terminate a function. Capstone prints a plain AArch64
    # return as ``ret`` with an empty operand string, so the previous
    # ``op_str == 'lr'`` test (an AArch32 idiom) never matched and the scanner
    # never left the first function it found.
    _RETURN_MNEMONICS = ('ret', 'retaa', 'retab')

    # Mnemonics treated as a prologue when their operands mention ``sp``.
    _PROLOGUE_MNEMONICS = ('stp', 'sub')

    def __init__(self, binary_bytes, start_address=0x0):
        """Store the image and the address at which its first byte is mapped.

        Args:
            binary_bytes: Raw bytes of the image to scan.
            start_address: Address of ``binary_bytes[0]``; defaults to 0.
        """
        self.binary_bytes = binary_bytes
        self.start_address = start_address
        self.functions = []

    @staticmethod
    def _direct_call_target(instruction):
        """Return the immediate target of a ``bl`` instruction, else ``None``."""
        if instruction.mnemonic != 'bl':
            return None
        operands = instruction.operands
        if operands and operands[0].type == ARM64_OP_IMM:
            return operands[0].imm
        return None

    def extract_functions(self, df, endian):
        """Scan the whole image and append one row per detected function.

        Args:
            df: DataFrame with ``FUNC_ADDRESS``, ``LOCATION`` and ``REFERENCES``
                columns; detected functions are appended after its rows.
            endian: ``'big'`` selects big-endian decoding; any other value
                decodes as little-endian.

        Returns:
            ``df`` itself when nothing was detected, otherwise a new DataFrame
            holding the original rows followed by the detected functions.
        """
        if endian == 'big':
            md = Cs(CS_ARCH_ARM64, CS_MODE_ARM | CS_MODE_BIG_ENDIAN)
        else:
            md = Cs(CS_ARCH_ARM64, CS_MODE_ARM)
        md.detail = True

        image = self.binary_bytes
        base = self.start_address

        # Rows are collected here and concatenated once at the end; growing
        # the DataFrame with one concat per function is quadratic.
        new_rows = []

        # REFERENCES list of the most recent row. Calls are attributed to it,
        # including a list-valued last row already present in ``df``.
        latest_refs = None
        if not df.empty and 'REFERENCES' in df.columns:
            existing = df.at[df.index[-1], 'REFERENCES']
            if isinstance(existing, list):
                latest_refs = existing

        in_function = False

        # Decode word by word instead of in one pass: a single disasm() call
        # stops at the first undecodable word, which is common in firmware
        # images that interleave data with code. The range bound skips a
        # trailing partial word.
        for offset in tqdm(range(0, len(image) - 3, 4), desc="Scanning binary for functions"):
            try:
                # Decode at the real address so LOCATION and branch targets
                # honour ``start_address``.
                instruction = next(md.disasm(image[offset:offset + 4], base + offset), None)
            except Exception:
                # Best effort: a word that cannot be decoded is skipped.
                continue
            if instruction is None:
                continue

            mnemonic = instruction.mnemonic

            if not in_function:
                if mnemonic in self._PROLOGUE_MNEMONICS and 'sp' in instruction.op_str:
                    in_function = True
                    latest_refs = []
                    new_rows.append({
                        'FUNC_ADDRESS': f"func_{instruction.address:x}",
                        'LOCATION': instruction.address,
                        'REFERENCES': latest_refs,
                    })

            if mnemonic in self._RETURN_MNEMONICS:
                in_function = False
                continue

            target = self._direct_call_target(instruction)
            if target is not None and latest_refs is not None:
                latest_refs.append(f"func_{target:x}")

        if not new_rows:
            return df
        return pd.concat([df, pd.DataFrame(new_rows)], ignore_index=True)

    def process_function(self, start_address, df, md, seen_functions, call_stack):
        """Disassemble one function linearly and record it once it returns.

        Args:
            start_address: Address of the function's first instruction.
            df: DataFrame the function row is appended to.
            md: Disassembler exposing ``disasm(data, address)``.
            seen_functions: Set of addresses already handled; updated in place.
            call_stack: List that receives call targets not seen yet.

        Returns:
            A DataFrame in every case: ``df`` unchanged when the address was
            already seen, lies outside the image, or no return instruction was
            reached; otherwise ``df`` plus one row for this function.
        """
        if start_address in seen_functions:
            # Return the frame (not None) so callers can always rebind it.
            return df
        seen_functions.add(start_address)

        # Translate the address into an offset within the image; a target
        # outside the image cannot be disassembled.
        offset = start_address - self.start_address
        if not 0 <= offset < len(self.binary_bytes):
            return df

        func_name = f"func_{start_address:x}"
        references = []
        for instruction in md.disasm(self.binary_bytes[offset:], start_address):
            if instruction.mnemonic in self._RETURN_MNEMONICS:
                new_row = pd.DataFrame([{
                    'FUNC_ADDRESS': func_name,
                    'LOCATION': start_address,
                    'REFERENCES': references,
                }])
                return pd.concat([df, new_row], ignore_index=True)

            target = self._direct_call_target(instruction)
            if target is None:
                continue
            references.append(f"func_{target:x}")
            if target not in seen_functions:
                call_stack.append(target)
        return df
class FunctionExtractor:
    """Read a firmware image and run the parser that matches its architecture.

    Extraction is triggered from the constructor; results end up in
    ``self.dataframe`` (AArch64) or ``self.functions`` (AArch32).
    """

    def __init__(self, binary, architecture, endian):
        """Record the inputs, create the empty result frame and extract.

        Args:
            binary: Path of the binary file to read.
            architecture: Architecture name; compared in upper case.
            endian: Endianness value handed to the AArch64 parser.
        """
        self.binary = binary
        self.endian = endian
        self.architecture = architecture.upper()
        result_columns = ['FUNC_ADDRESS', 'LOCATION', 'REFERENCES']
        self.dataframe = pd.DataFrame(columns=result_columns)
        self.extract_functions()

    def binary_bytes(self):
        """Return the full contents of ``self.binary`` read in binary mode."""
        with open(self.binary, 'rb') as handle:
            contents = handle.read()
        return contents

    def extract_functions(self):
        """Dispatch to the parser for ``self.architecture``.

        The file is read before the architecture is checked. An unsupported
        architecture is logged as an error and nothing else is changed.
        """
        raw = self.binary_bytes()
        arch = self.architecture

        if arch == 'ARM_AARCH64':
            self.parser = ARM_AARCH64_Parser(raw)
            self.dataframe = self.parser.extract_functions(self.dataframe, self.endian)
            return None

        if arch == 'ARM_AARCH32':
            self.parser = ARM_AARCH32_Parser(raw)
            self.functions = self.parser.extract_functions()
            return None

        logger.error('Architecture not supported: ' + self.architecture)
        return None
def build_arg_parser():
    """Create the command-line parser for the function-extraction script."""
    cli = argparse.ArgumentParser(description='Extract functions from firmware binaries.')
    cli.add_argument('--binary', help='The binary file to extract functions from.', required=True, type=str)
    cli.add_argument('--output', help='The output file to write the functions to.', required=False, type=str)
    cli.add_argument(
        '--architecture',
        help='The architecture of the binary. Use: ARM_AARCH64, ARM_AARCH32, etc.',
        required=True,
        type=str,
        choices=['ARM_AARCH64', 'ARM_AARCH32']
    )
    cli.add_argument(
        '--endian',
        help='The endianness of the binary. Use: little, big.',
        required=True,
        type=str,
        choices=['little', 'big']
    )
    return cli


def main(argv=None):
    """Parse the command line, extract functions and write them as CSV.

    Args:
        argv: Argument list to parse; ``None`` lets argparse use ``sys.argv``.
    """
    args = build_arg_parser().parse_args(argv)

    # Build one message string. The original passed the second fragment as a
    # separate positional argument, which a stdlib-style logger would treat
    # as a %-format argument rather than as message text.
    logger.info(
        'Extracting functions from binary: ' + args.binary
        + ' using architecture: ' + args.architecture.upper()
    )

    extractor = FunctionExtractor(args.binary, args.architecture, args.endian)

    # Default to "<binary>.functions" when no output path was given.
    output_path = args.output or args.binary + '.functions'

    # pandas asks for file handles to be opened with newline='' so the CSV
    # writer controls line endings (avoids blank rows on Windows).
    with open(output_path, 'w', newline='') as f:
        extractor.dataframe.to_csv(f, index=False)


if __name__ == "__main__":
    main()
# Appends the relative entry '.' to the module search path.
# NOTE(review): no import is visible after this line, so its purpose is unclear
# from here; confirm whether it is still needed and whether it was meant to sit
# under the ``__main__`` guard.
sys.path.append('.')