161 lines
5.8 KiB
Python
161 lines
5.8 KiB
Python
import argparse
|
|
import sys
|
|
from capstone import *
|
|
from capstone.arm64 import ARM64_OP_IMM
|
|
from tqdm import tqdm
|
|
import pandas as pd
|
|
from herrewebpy import logger
|
|
|
|
class ARM_AARCH32_Parser:
    """Placeholder parser for 32-bit ARM (AArch32) binaries.

    It only stores the raw bytes; function extraction is not implemented yet.
    """

    def __init__(self, binary_bytes):
        """Keep the raw binary and start with an empty function list."""
        self.functions = []
        self.binary_bytes = binary_bytes

    def extract_functions(self):
        """Extract functions from an ARM AARCH32 binary (not implemented yet).

        Returns:
            The ``NotImplemented`` singleton; nothing is raised and
            ``self.functions`` is left untouched.
        """
        placeholder = NotImplemented
        return placeholder
|
|
|
|
|
|
class ARM_AARCH64_Parser:
    """Heuristic function finder for raw AArch64 (ARM64) code.

    The binary is treated as a flat run of 4-byte instructions loaded at
    ``start_address``. Results are rows of a DataFrame with the columns
    ``FUNC_ADDRESS`` (``func_<hex address>``), ``LOCATION`` (the address as
    an int) and ``REFERENCES`` (list of ``func_<hex address>`` names of the
    direct call targets found after the function start).
    """

    # Mnemonics that, when they touch ``sp``, are taken as a function prologue.
    _PROLOGUE_MNEMONICS = ('stp', 'sub')
    # Direct-call mnemonics ('blx' is kept from the original check).
    _CALL_MNEMONICS = ('bl', 'blx')
    # Plain and pointer-authenticated returns.
    _RETURN_MNEMONICS = ('ret', 'retaa', 'retab')

    def __init__(self, binary_bytes, start_address=0x0):
        """Store the raw code and the address its first byte is loaded at."""
        self.binary_bytes = binary_bytes
        self.start_address = start_address
        self.functions = []

    @classmethod
    def _is_return(cls, instruction):
        """Return True when *instruction* ends a function.

        Capstone prints the common AArch64 return as ``ret`` with an empty
        operand string, so the operand text must not be compared with ``lr``
        (doing so meant no function end was ever recognised).
        """
        if instruction.mnemonic in cls._RETURN_MNEMONICS:
            return True
        # Legacy spelling retained from the original condition.
        return instruction.mnemonic == 'bx' and instruction.op_str == 'lr'

    @classmethod
    def _call_target(cls, instruction):
        """Return the immediate target of a direct call, or None otherwise."""
        if instruction.mnemonic not in cls._CALL_MNEMONICS:
            return None
        operands = instruction.operands
        if not operands or operands[0].type != ARM64_OP_IMM:
            return None
        return operands[0].imm

    def extract_functions(self, df, endian):
        """Linearly scan the binary and append one row per detected function.

        A function starts at the first ``stp``/``sub`` instruction that
        mentions ``sp`` while no function is open, and ends at the next
        return. Every direct call is appended to the ``REFERENCES`` list of
        the most recent row (a new one, or the last row of *df* if no new
        function has been found yet).

        Args:
            df: DataFrame with FUNC_ADDRESS / LOCATION / REFERENCES columns.
            endian: ``'big'`` selects big-endian decoding; any other value
                selects little-endian.

        Returns:
            *df* itself when nothing was found, otherwise a new DataFrame
            with the detected functions appended.
        """
        mode = CS_MODE_ARM
        if endian == 'big':
            mode |= CS_MODE_BIG_ENDIAN
        md = Cs(CS_ARCH_ARM64, mode)
        md.detail = True  # operand details are needed to read call targets

        code = self.binary_bytes
        rows = []  # new rows are collected here and concatenated once
        in_function = False

        # Stop early enough that every slice holds a full 4-byte instruction.
        for offset in tqdm(range(0, len(code) - 3, 4), desc="Scanning binary for functions"):
            # Decode at the load address so LOCATION and call targets are
            # expressed in the same address space.
            address = self.start_address + offset
            try:
                instruction = next(md.disasm(code[offset:offset + 4], address), None)
            except Exception:
                # Best-effort scan: a word that cannot be decoded is skipped.
                continue
            if instruction is None:
                continue

            if (
                not in_function
                and instruction.mnemonic in self._PROLOGUE_MNEMONICS
                and 'sp' in instruction.op_str
            ):
                in_function = True
                rows.append({
                    'FUNC_ADDRESS': f"func_{instruction.address:x}",
                    'LOCATION': instruction.address,
                    'REFERENCES': [],
                })

            if self._is_return(instruction):
                in_function = False

            target_address = self._call_target(instruction)
            if target_address is not None:
                reference = f"func_{target_address:x}"
                if rows:
                    rows[-1]['REFERENCES'].append(reference)
                elif not df.empty:
                    df.at[df.index[-1], 'REFERENCES'].append(reference)

        if not rows:
            return df
        return pd.concat([df, pd.DataFrame(rows)], ignore_index=True)

    def process_function(self, start_address, df, md, seen_functions, call_stack):
        """Disassemble one function from *start_address* up to its return.

        Args:
            start_address: Address of the function's first instruction, in
                the same address space as ``self.start_address``.
            df: DataFrame the function row is appended to.
            md: Disassembler exposing ``disasm(code, address)``; instructions
                must expose ``mnemonic``, ``op_str`` and ``operands``.
            seen_functions: Set of already processed addresses (updated).
            call_stack: List that receives not-yet-seen call targets.

        Returns:
            The DataFrame with the new row appended, or *df* unchanged when
            the address was already seen, lies outside the binary, or no
            return instruction was reached.
        """
        if start_address in seen_functions:
            # Always hand a DataFrame back so callers can reassign the result.
            return df
        seen_functions.add(start_address)

        # Translate the address into an index into the raw bytes.
        offset = start_address - self.start_address
        if not 0 <= offset < len(self.binary_bytes):
            return df

        references = []
        for instruction in md.disasm(self.binary_bytes[offset:], start_address):
            if self._is_return(instruction):
                new_row = pd.DataFrame([{
                    'FUNC_ADDRESS': f"func_{start_address:x}",
                    'LOCATION': start_address,
                    'REFERENCES': references,
                }])
                return pd.concat([df, new_row], ignore_index=True)

            target_address = self._call_target(instruction)
            if target_address is None:
                continue
            references.append(f"func_{target_address:x}")
            if target_address not in seen_functions:
                call_stack.append(target_address)

        return df
|
|
|
|
class FunctionExtractor:
    """Read a firmware binary and extract its functions on construction.

    Attributes:
        binary: Path of the binary file.
        architecture: Upper-cased architecture name.
        endian: Endianness string handed to the AArch64 parser.
        dataframe: DataFrame with FUNC_ADDRESS / LOCATION / REFERENCES columns.
        parser: Architecture-specific parser, or None if none was created.
        functions: Result of the AArch32 parser; stays an empty list while
            that parser reports ``NotImplemented``.
    """

    def __init__(self, binary, architecture, endian):
        """Store the settings and run the extraction immediately."""
        self.binary = binary
        self.architecture = architecture.upper()
        self.endian = endian
        self.dataframe = pd.DataFrame(columns=['FUNC_ADDRESS', 'LOCATION', 'REFERENCES'])
        # Defined up front so both attributes exist whichever branch
        # extract_functions() takes.
        self.parser = None
        self.functions = []
        self.extract_functions()

    def binary_bytes(self):
        """Return the complete content of the binary file as bytes."""
        with open(self.binary, 'rb') as f:
            return f.read()

    def extract_functions(self):
        """Dispatch to the parser matching ``self.architecture``.

        AArch64 results replace ``self.dataframe``. An unsupported or not yet
        implemented architecture is logged and leaves the results empty.
        """
        binary_bytes = self.binary_bytes()

        if self.architecture == 'ARM_AARCH32':
            self.parser = ARM_AARCH32_Parser(binary_bytes)
            result = self.parser.extract_functions()
            if result is NotImplemented:
                # Say so explicitly instead of silently storing the sentinel
                # and producing an empty result.
                logger.error('Function extraction not implemented for: ' + self.architecture)
            else:
                self.functions = result
        elif self.architecture == 'ARM_AARCH64':
            self.parser = ARM_AARCH64_Parser(binary_bytes)
            self.dataframe = self.parser.extract_functions(self.dataframe, self.endian)
        else:
            logger.error('Architecture not supported: ' + self.architecture)
            return
|
|
|
|
def main():
    """Command-line entry point: parse arguments, extract functions, write CSV.

    When ``--output`` is omitted the CSV is written next to the binary as
    ``<binary>.functions``.
    """
    parser = argparse.ArgumentParser(description='Extract functions from firmware binaries.')
    parser.add_argument('--binary', help='The binary file to extract functions from.', required=True, type=str)
    parser.add_argument('--output', help='The output file to write the functions to.', required=False, type=str)
    parser.add_argument(
        '--architecture',
        help='The architecture of the binary. Use: ARM_AARCH64, ARM_AARCH32, etc.',
        required=True,
        type=str,
        choices=['ARM_AARCH64', 'ARM_AARCH32'],
    )
    parser.add_argument(
        '--endian',
        help='The endianness of the binary. Use: little, big.',
        required=True,
        type=str,
        choices=['little', 'big'],
    )
    args = parser.parse_args()

    # One message string: the original passed the second half as a separate
    # positional argument, which a logging-style logger would treat as a
    # %-format argument for a message that has no placeholders.
    logger.info(
        'Extracting functions from binary: ' + args.binary
        + ' using architecture: ' + args.architecture.upper()
    )

    extractor = FunctionExtractor(args.binary, args.architecture, args.endian)

    if not args.output:
        args.output = args.binary + '.functions'

    # Hand pandas the path so it opens the file itself with the newline
    # handling its CSV writer expects.
    extractor.dataframe.to_csv(args.output, index=False)


if __name__ == "__main__":
    main()
|
|
|
|
# Append the relative entry '.' to the module search path.
# NOTE(review): no import is visible after this line, so its purpose cannot
# be determined from here - confirm before moving or removing it.
sys.path.append('.')
|