diff --git a/.vscode/launch.json b/.vscode/launch.json new file mode 100644 index 0000000..71f426b --- /dev/null +++ b/.vscode/launch.json @@ -0,0 +1,24 @@ +{ + // Use IntelliSense to learn about possible attributes. + // Hover to view descriptions of existing attributes. + // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 + "version": "0.2.0", + "configurations": [ + { + "name": "FF_FunctionExtractor", + "type": "debugpy", + "request": "launch", + "program": "herrewebpy/firmware_forensics/function_extractor.py", + "args": [ + "--binary", + "sample_data/firmwares/S7_BL31.bin", + "--architecture", + "ARM_AARCH64", + "--endian", + "little", + ], + "console": "integratedTerminal", + "justMyCode": false + } + ] +} \ No newline at end of file diff --git a/debug.py b/debug.py index e022498..555e15d 100644 --- a/debug.py +++ b/debug.py @@ -1,2 +1,5 @@ -from herrewebpy.bioinformatics import sequence_alignment -sequence_alignment.SequenceAlignment(['aa', 'bb', 'cc'],['bb','aa','cc'], ['1','2','3'], ['1','2','3']) \ No newline at end of file +#from herrewebpy.bioinformatics import sequence_alignment +#sequence_alignment.SequenceAlignment(['aa', 'bb', 'cc'],['bb','aa','cc'], ['1','2','3'], ['1','2','3']) + +from herrewebpy.firmware_forensics import function_extractor +function_extractor.FunctionExtractor('', 'ARM_AARCH64') \ No newline at end of file diff --git a/herrewebpy/firmware_forensics/__init__.py b/herrewebpy/firmware_forensics/__init__.py new file mode 100644 index 0000000..b974282 --- /dev/null +++ b/herrewebpy/firmware_forensics/__init__.py @@ -0,0 +1 @@ +from . 
# NOTE: the module-level imports of this file (argparse, sys, capstone,
# capstone.arm64.ARM64_OP_IMM, tqdm, pandas as pd, herrewebpy.logger) are
# file-level lines and are expected to stay exactly as they are above this
# point; everything below relies on them.

# Mnemonics that terminate an AArch64 function body. Capstone prints a plain
# ``ret`` with an empty operand string (x30 is implicit), so the check must
# not depend on ``op_str``. ``retaa``/``retab`` are the pointer-authenticated
# return forms.
_RETURN_MNEMONICS = ('ret', 'retaa', 'retab')
_CALL_MNEMONICS = ('bl', 'blx')
_PROLOGUE_MNEMONICS = ('stp', 'sub')


def _is_function_return(instruction):
    """Return True when *instruction* ends the function being scanned."""
    if instruction.mnemonic in _RETURN_MNEMONICS:
        return True
    # Legacy heuristic kept from the original code for ``bx lr`` style returns.
    return instruction.mnemonic == 'bx' and instruction.op_str == 'lr'


def _is_function_prologue(instruction):
    """Return True for the stack-adjusting instructions used as a prologue hint."""
    return (
        instruction.mnemonic in _PROLOGUE_MNEMONICS
        and 'sp' in instruction.op_str
    )


def _direct_call_target(instruction):
    """Return the immediate target of a direct call, or None if there is none."""
    if instruction.mnemonic not in _CALL_MNEMONICS:
        return None
    operands = instruction.operands
    if operands and operands[0].type == ARM64_OP_IMM:
        return operands[0].imm
    return None


class ARM_AARCH32_Parser:
    """Placeholder parser for 32-bit ARM images (extraction not implemented)."""

    def __init__(self, binary_bytes):
        self.binary_bytes = binary_bytes
        self.functions = []

    def extract_functions(self):
        """Return ``NotImplemented``; AArch32 extraction does not exist yet."""
        return NotImplemented


class ARM_AARCH64_Parser:
    """Heuristic function finder for raw AArch64 firmware images.

    Args:
        binary_bytes: Raw image contents.
        start_address: Address at which the first byte of the image is
            assumed to be loaded. Reported locations and call targets are
            relative to this base.
    """

    def __init__(self, binary_bytes, start_address=0x0):
        self.binary_bytes = binary_bytes
        self.start_address = start_address
        self.functions = []

    def extract_functions(self, df, endian):
        """Linearly scan the image and append one row per detected function.

        A function starts at the first ``stp``/``sub`` touching ``sp`` seen
        while no function is open, and is closed by a return instruction.
        Direct ``bl`` targets are appended to the ``REFERENCES`` list of the
        most recently discovered function.

        Args:
            df: DataFrame with ``FUNC_ADDRESS``, ``LOCATION`` and
                ``REFERENCES`` columns; new rows are appended to a copy.
            endian: ``'big'`` selects big-endian decoding, anything else
                little-endian.

        Returns:
            The DataFrame including the newly discovered functions.
        """
        mode = CS_MODE_ARM
        if endian == 'big':
            mode |= CS_MODE_BIG_ENDIAN
        md = Cs(CS_ARCH_ARM64, mode)
        md.detail = True  # operand details are needed to read call targets

        code = self.binary_bytes
        base = self.start_address
        new_rows = []
        in_function = False
        # Calls seen before the first new function keep going to the last
        # pre-existing row, as before.
        current_refs = df['REFERENCES'].iloc[-1] if not df.empty else None

        # Decode one 4-byte word at a time so that an undecodable word (data
        # embedded in code) does not stop the scan.
        word_offsets = range(0, len(code) - 3, 4)
        for offset in tqdm(word_offsets, desc="Scanning binary for functions"):
            try:
                instruction = next(
                    md.disasm(code[offset:offset + 4], base + offset), None
                )
            except CsError:
                continue
            if instruction is None:
                continue

            if not in_function and _is_function_prologue(instruction):
                in_function = True
                current_refs = []
                new_rows.append({
                    'FUNC_ADDRESS': f"func_{instruction.address:x}",
                    'LOCATION': instruction.address,
                    'REFERENCES': current_refs,
                })

            if _is_function_return(instruction):
                in_function = False

            target = _direct_call_target(instruction)
            if target is not None and current_refs is not None:
                current_refs.append(f"func_{target:x}")

        # Build the frame once instead of concatenating per function.
        if new_rows:
            df = pd.concat([df, pd.DataFrame(new_rows)], ignore_index=True)
        return df

    def process_function(self, start_address, df, md, seen_functions, call_stack):
        """Disassemble one function from *start_address* up to its return.

        Args:
            start_address: Address of the function entry (same address space
                as ``self.start_address``).
            df: DataFrame to which the function row is appended.
            md: Disassembler exposing ``disasm(code, address)``.
            seen_functions: Set of already processed entry addresses; updated
                in place.
            call_stack: List receiving call targets not yet in
                *seen_functions*; updated in place.

        Returns:
            The DataFrame, with one extra row when a return was reached. It is
            returned unchanged when the address was already processed, lies
            outside the image, or no return instruction is found.
        """
        if start_address in seen_functions:
            return df
        seen_functions.add(start_address)

        # Translate the address into an index into the raw image.
        offset = start_address - self.start_address
        if not 0 <= offset < len(self.binary_bytes):
            return df

        func_name = f"func_{start_address:x}"
        references = []

        for instruction in md.disasm(self.binary_bytes[offset:], start_address):
            if _is_function_return(instruction):
                new_row = pd.DataFrame([{
                    'FUNC_ADDRESS': func_name,
                    'LOCATION': start_address,
                    'REFERENCES': references,
                }])
                df = pd.concat([df, new_row], ignore_index=True)
                break

            target = _direct_call_target(instruction)
            if target is not None:
                references.append(f"func_{target:x}")
                if target not in seen_functions:
                    call_stack.append(target)

        return df


class FunctionExtractor:
    """Load a firmware image and extract its functions into ``self.dataframe``.

    Args:
        binary: Path of the firmware image.
        architecture: ``'ARM_AARCH64'`` or ``'ARM_AARCH32'`` (case-insensitive).
        endian: ``'little'`` (default) or ``'big'``.
    """

    def __init__(self, binary, architecture, endian='little'):
        self.binary = binary
        self.architecture = architecture.upper()
        self.endian = endian
        self.dataframe = pd.DataFrame(
            columns=['FUNC_ADDRESS', 'LOCATION', 'REFERENCES']
        )
        self.extract_functions()

    def binary_bytes(self):
        """Return the full contents of the image file."""
        with open(self.binary, 'rb') as f:
            return f.read()

    def extract_functions(self):
        """Run the parser matching ``self.architecture``."""
        binary_bytes = self.binary_bytes()

        if self.architecture == 'ARM_AARCH32':
            self.parser = ARM_AARCH32_Parser(binary_bytes)
            self.functions = self.parser.extract_functions()
        elif self.architecture == 'ARM_AARCH64':
            self.parser = ARM_AARCH64_Parser(binary_bytes)
            self.dataframe = self.parser.extract_functions(
                self.dataframe, self.endian
            )
        else:
            logger.error('Architecture not supported: ' + self.architecture)
            return


def main():
    """Command-line entry point: extract functions and write them as CSV."""
    parser = argparse.ArgumentParser(
        description='Extract functions from firmware binaries.'
    )
    parser.add_argument(
        '--binary',
        help='The binary file to extract functions from.',
        required=True,
        type=str,
    )
    parser.add_argument(
        '--output',
        help='The output file to write the functions to.',
        required=False,
        type=str,
    )
    parser.add_argument(
        '--architecture',
        help='The architecture of the binary. Use: ARM_AARCH64, ARM_AARCH32, etc.',
        required=True,
        type=str,
        choices=['ARM_AARCH64', 'ARM_AARCH32'],
    )
    parser.add_argument(
        '--endian',
        help='The endianness of the binary. Use: little, big.',
        required=False,
        default='little',
        type=str,
        choices=['little', 'big'],
    )
    args = parser.parse_args()

    # One message string: a second positional argument would be treated as a
    # %-format argument by a standard logger.
    logger.info(
        'Extracting functions from binary: ' + args.binary
        + ' using architecture: ' + args.architecture.upper()
    )

    extractor = FunctionExtractor(args.binary, args.architecture, args.endian)

    output_path = args.output or args.binary + '.functions'
    extractor.dataframe.to_csv(output_path, index=False)


if __name__ == "__main__":
    main()