"""Extract function start addresses and direct-call references from firmware binaries.

The binary is scanned as a flat stream of 4-byte words. Function starts are
recognised heuristically by a stack-adjusting prologue (``stp``/``sub``
mentioning ``sp``) and function ends by a return instruction. The results are
collected in a pandas DataFrame with the columns ``FUNC_ADDRESS``,
``LOCATION`` and ``REFERENCES``.
"""
import argparse
import sys

from capstone import *
from capstone.arm64 import ARM64_OP_IMM
from tqdm import tqdm
import pandas as pd

from herrewebpy import logger


def _is_function_return(instruction):
    """Return True when *instruction* ends a function.

    Capstone renders a plain AArch64 ``ret`` with an empty operand string, so
    the mnemonic alone has to be accepted; requiring ``op_str == 'lr'`` for
    ``ret`` would never match. The ``bx lr`` form is still recognised as
    before.
    """
    if instruction.mnemonic == 'ret':
        return True
    return instruction.mnemonic == 'bx' and instruction.op_str == 'lr'


def _direct_call_target(instruction):
    """Return the immediate target of a ``bl``/``blx`` call, else ``None``.

    Calls whose first operand is not an immediate (or that have no operands)
    yield ``None``. Reading ``operands`` requires the disassembler to have
    ``detail`` enabled.
    """
    if instruction.mnemonic not in ('bl', 'blx'):
        return None
    operands = instruction.operands
    if operands and operands[0].type == ARM64_OP_IMM:
        return operands[0].imm
    return None


class ARM_AARCH32_Parser:
    """Placeholder parser for ARM AARCH32 binaries."""

    def __init__(self, binary_bytes):
        """Store the raw bytes of the binary; nothing is parsed yet."""
        self.binary_bytes = binary_bytes
        self.functions = []

    def extract_functions(self):
        """Extract functions from ARM AARCH32 binary. Not implemented yet.

        Returns:
            The ``NotImplemented`` sentinel (nothing is raised).
        """
        return NotImplemented


class ARM_AARCH64_Parser:
    """Heuristic function finder for ARM AARCH64 binaries, built on Capstone."""

    def __init__(self, binary_bytes, start_address=0x0):
        """Store the raw bytes and the address at which the first byte is located.

        Args:
            binary_bytes: Raw content of the binary.
            start_address: Address assigned to offset 0 of ``binary_bytes``.
        """
        self.binary_bytes = binary_bytes
        self.start_address = start_address
        self.functions = []

    def extract_functions(self, df, endian):
        """Scan the binary word by word and record detected functions.

        A function starts at the first ``stp``/``sub`` instruction whose
        operands mention ``sp`` while no function is open, and it ends at the
        next return instruction. Every direct ``bl``/``blx`` call is appended
        to the ``REFERENCES`` list of the most recent row (a newly detected
        function, or the last row of the incoming ``df`` if none has been
        detected yet).

        Args:
            df: DataFrame with the columns ``FUNC_ADDRESS``, ``LOCATION`` and
                ``REFERENCES``; detected functions are appended to it.
            endian: ``'big'`` selects big-endian decoding; any other value
                selects the default mode.

        Returns:
            A DataFrame holding the rows of ``df`` followed by one row per
            detected function.
        """
        mode = CS_MODE_ARM
        if endian == 'big':
            mode |= CS_MODE_BIG_ENDIAN
        md = Cs(CS_ARCH_ARM64, mode)
        md.detail = True  # operand details are needed to read call targets

        data = self.binary_bytes
        func_start = None
        # Rows are collected here and concatenated once at the end instead of
        # growing the DataFrame with one pd.concat per function.
        new_rows = []

        for offset in tqdm(range(0, len(data), 4), desc="Scanning binary for functions"):
            word = data[offset:offset + 4]
            if len(word) < 4:
                continue
            try:
                # Disassemble at the real address so that locations and the
                # PC-relative call targets honour ``start_address``.
                instruction = next(md.disasm(word, self.start_address + offset), None)
            except Exception:
                # Best effort: a word that cannot be decoded is skipped.
                continue
            if instruction is None:
                continue

            if (func_start is None
                    and instruction.mnemonic in ('stp', 'sub')
                    and 'sp' in instruction.op_str):
                func_start = instruction.address
                new_rows.append({
                    'FUNC_ADDRESS': f"func_{func_start:x}",
                    'LOCATION': func_start,
                    'REFERENCES': []
                })

            if _is_function_return(instruction):
                func_start = None

            target_address = _direct_call_target(instruction)
            if target_address is not None:
                if new_rows:
                    references = new_rows[-1]['REFERENCES']
                elif not df.empty:
                    references = df.at[df.index[-1], 'REFERENCES']
                else:
                    references = None
                if references is not None:
                    references.append(f"func_{target_address:x}")

        if new_rows:
            df = pd.concat([df, pd.DataFrame(new_rows)], ignore_index=True)
        return df

    def process_function(self, start_address, df, md, seen_functions, call_stack):
        """Disassemble linearly from ``start_address`` up to the first return.

        When a return instruction is reached, one row describing the function
        is appended to ``df``. Direct call targets met on the way are listed
        in that row's ``REFERENCES`` and, if not yet in ``seen_functions``,
        pushed onto ``call_stack``.

        Args:
            start_address: Address of the function's first instruction.
            df: DataFrame to which the function row is appended.
            md: Capstone disassembler; ``detail`` must be enabled because
                operands are inspected.
            seen_functions: Set of addresses already processed (updated in place).
            call_stack: List receiving newly discovered call targets
                (updated in place).

        Returns:
            ``df`` with the new row appended, or ``df`` unchanged when the
            address was already seen, lies outside the binary, or no return
            instruction is reached.
        """
        if start_address in seen_functions:
            return df
        seen_functions.add(start_address)

        # Translate the address into an offset inside the image; a negative
        # offset would otherwise silently slice from the end of the bytes.
        image_offset = start_address - self.start_address
        if not 0 <= image_offset < len(self.binary_bytes):
            return df

        references = []
        for instruction in md.disasm(self.binary_bytes[image_offset:], start_address):
            if _is_function_return(instruction):
                new_row = pd.DataFrame([{
                    'FUNC_ADDRESS': f"func_{start_address:x}",
                    'LOCATION': start_address,
                    'REFERENCES': references
                }])
                return pd.concat([df, new_row], ignore_index=True)

            target_address = _direct_call_target(instruction)
            if target_address is not None:
                references.append(f"func_{target_address:x}")
                if target_address not in seen_functions:
                    call_stack.append(target_address)

        return df


class FunctionExtractor:
    """Read a binary from disk and run the parser matching its architecture."""

    def __init__(self, binary, architecture, endian):
        """Run the extraction immediately.

        Args:
            binary: Path of the binary file.
            architecture: ``'ARM_AARCH64'`` or ``'ARM_AARCH32'``
                (case-insensitive).
            endian: Passed to the AARCH64 parser; ``'big'`` selects
                big-endian decoding.
        """
        self.binary = binary
        self.architecture = architecture.upper()
        self.endian = endian
        self.dataframe = pd.DataFrame(columns=['FUNC_ADDRESS', 'LOCATION', 'REFERENCES'])
        self.extract_functions()

    def binary_bytes(self):
        """Return the whole content of the binary file as bytes."""
        with open(self.binary, 'rb') as f:
            return f.read()

    def extract_functions(self):
        """Select the parser for ``self.architecture`` and run it.

        For ARM_AARCH64 the result is stored in ``self.dataframe``. For
        ARM_AARCH32 the parser's return value is stored in
        ``self.functions`` and ``self.dataframe`` stays empty. Any other
        architecture is reported through the logger.
        """
        binary_bytes = self.binary_bytes()

        if self.architecture == 'ARM_AARCH32':
            self.parser = ARM_AARCH32_Parser(binary_bytes)
            self.functions = self.parser.extract_functions()
        elif self.architecture == 'ARM_AARCH64':
            self.parser = ARM_AARCH64_Parser(binary_bytes)
            self.dataframe = self.parser.extract_functions(self.dataframe, self.endian)
        else:
            logger.error('Architecture not supported: ' + self.architecture)
            return


def main():
    """Parse the command line, extract the functions and write them as CSV."""
    parser = argparse.ArgumentParser(description='Extract functions from firmware binaries.')
    parser.add_argument('--binary', help='The binary file to extract functions from.', required=True, type=str)
    parser.add_argument('--output', help='The output file to write the functions to.', required=False, type=str)
    parser.add_argument(
        '--architecture',
        help='The architecture of the binary. Use: ARM_AARCH64, ARM_AARCH32, etc.',
        required=True,
        # Upper-case before the ``choices`` check so lower-case spellings are
        # accepted, matching the normalisation done by FunctionExtractor.
        type=str.upper,
        choices=['ARM_AARCH64', 'ARM_AARCH32']
    )
    parser.add_argument(
        '--endian',
        help='The endianness of the binary. Use: little, big.',
        required=True,
        type=str,
        choices=['little', 'big']
    )
    args = parser.parse_args()

    # One message string: a second positional argument would be treated as a
    # %-format argument by a standard logger and fail to format.
    logger.info(
        f'Extracting functions from binary: {args.binary} '
        f'using architecture: {args.architecture}'
    )
    extractor = FunctionExtractor(args.binary, args.architecture, args.endian)

    if not args.output:
        args.output = args.binary + '.functions'

    # Hand the path to pandas so it opens the file with the newline handling
    # that CSV output needs.
    extractor.dataframe.to_csv(args.output, index=False)


if __name__ == "__main__":
    main()
    # Kept from the original script, where it was the last statement; it runs
    # after all imports and therefore does not influence them.
    sys.path.append('.')