Add the initial setup of function_extractor, which extracts functions from binary files (for use in graph networking later on)

2024-09-07 21:41:17 +02:00 · 2024-09-07 21:41:17 +02:00 · 3680b94d17
commit 3680b94d17
parent ce2384cb72
6 changed files with 196 additions and 4 deletions
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@ -0,0 +1,24 @@
+{
+    // Use IntelliSense to learn about possible attributes.
+    // Hover to view descriptions of existing attributes.
+    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
+    "version": "0.2.0",
+    "configurations": [
+        {
+            "name": "FF_FunctionExtractor",
+            "type": "debugpy",
+            "request": "launch",
+            "program": "herrewebpy/firmware_forensics/function_extractor.py",
+            "args": [
+                "--binary",
+                "sample_data/firmwares/S7_BL31.bin",
+                "--architecture",
+                "ARM_AARCH64",
+                "--endian",
+                "little"
+            ],
+            "console": "integratedTerminal",
+            "justMyCode": false
+        }
+    ]
+}
--- a/debug.py
+++ b/debug.py
@ -1,2 +1,5 @@
-from herrewebpy.bioinformatics import sequence_alignment
-sequence_alignment.SequenceAlignment(['aa', 'bb', 'cc'],['bb','aa','cc'], ['1','2','3'], ['1','2','3'])
+#from herrewebpy.bioinformatics import sequence_alignment
+#sequence_alignment.SequenceAlignment(['aa', 'bb', 'cc'],['bb','aa','cc'], ['1','2','3'], ['1','2','3'])
+
+from herrewebpy.firmware_forensics import function_extractor
+# FunctionExtractor reads the binary while it is being constructed, so it needs
+# a real image path and an explicit endianness; the values below match the
+# FF_FunctionExtractor launch configuration.
+function_extractor.FunctionExtractor('sample_data/firmwares/S7_BL31.bin', 'ARM_AARCH64', 'little')
--- a/herrewebpy/firmware_forensics/__init__.py
+++ b/herrewebpy/firmware_forensics/__init__.py
@ -0,0 +1 @@
+from . import *
--- a/herrewebpy/firmware_forensics/function_extractor.py
+++ b/herrewebpy/firmware_forensics/function_extractor.py
@ -0,0 +1,160 @@
+import argparse
+import sys
+from capstone import *
+from capstone.arm64 import ARM64_OP_IMM
+from tqdm import tqdm
+import pandas as pd
+from herrewebpy import logger
+
+class ARM_AARCH32_Parser:
+    """Placeholder parser for ARM AArch32 binaries (extraction not written yet)."""
+
+    def __init__(self, binary_bytes):
+        # Result list starts empty; the raw image bytes are kept as given.
+        self.functions = []
+        self.binary_bytes = binary_bytes
+
+    def extract_functions(self):
+        """Return the ``NotImplemented`` sentinel, as AArch32 support is missing."""
+        return NotImplemented
+    
+
+class ARM_AARCH64_Parser:
+    """Heuristic function finder for raw AArch64 (ARM64) binary images.
+
+    The image is handled as a flat sequence of 4-byte instruction words whose
+    first byte is located at ``start_address``.
+    """
+
+    # Mnemonics that end a function. Capstone renders a plain AArch64 return
+    # as 'ret' with an empty operand string, so the operand text is not
+    # compared; 'bx lr' is AArch32-only and never produced in ARM64 mode.
+    _RETURN_MNEMONICS = ('ret', 'retaa', 'retab')
+
+    def __init__(self, binary_bytes, start_address=0x0):
+        self.binary_bytes = binary_bytes
+        self.start_address = start_address
+        self.functions = []
+
+    def extract_functions(self, df, endian):
+        """Scan the image word by word and record every detected function.
+
+        A function is opened by the first ``stp``/``sub`` instruction that
+        mentions ``sp`` while no function is open, and closed by a return
+        instruction. The immediate target of each ``bl`` is appended to the
+        ``REFERENCES`` list of the most recently recorded function.
+
+        Args:
+            df: DataFrame with the columns FUNC_ADDRESS, LOCATION and
+                REFERENCES; detected functions are appended after its rows.
+            endian: 'big' selects big-endian decoding; any other value
+                decodes as little-endian.
+
+        Returns:
+            ``df`` itself when nothing was detected, otherwise a new
+            DataFrame made of the rows of ``df`` followed by one row per
+            detected function.
+        """
+        mode = CS_MODE_ARM
+        if endian == 'big':
+            mode |= CS_MODE_BIG_ENDIAN
+        md = Cs(CS_ARCH_ARM64, mode)
+        md.detail = True  # operand details are required to read branch targets
+
+        rows = []  # collected here and concatenated once, not once per function
+        in_function = False
+
+        for offset in tqdm(range(0, len(self.binary_bytes), 4), desc="Scanning binary for functions"):
+            word = self.binary_bytes[offset:offset + 4]
+            if len(word) < 4:
+                continue  # trailing bytes that do not form a full instruction
+
+            try:
+                # Decode at the load address so reported locations and branch
+                # targets share one address space.
+                instruction = next(md.disasm(word, self.start_address + offset), None)
+            except CsError:
+                continue
+            if instruction is None:
+                continue  # the word does not decode to an instruction
+
+            if (not in_function
+                    and instruction.mnemonic in ('stp', 'sub')
+                    and 'sp' in instruction.op_str):
+                in_function = True
+                rows.append({
+                    'FUNC_ADDRESS': f"func_{instruction.address:x}",
+                    'LOCATION': instruction.address,
+                    'REFERENCES': []
+                })
+
+            if instruction.mnemonic in self._RETURN_MNEMONICS:
+                in_function = False
+
+            if instruction.mnemonic == 'bl':
+                operands = instruction.operands
+                if operands and operands[0].type == ARM64_OP_IMM:
+                    target = f"func_{operands[0].imm:x}"
+                    if rows:
+                        rows[-1]['REFERENCES'].append(target)
+                    elif not df.empty:
+                        # No function found in this scan yet: keep attributing
+                        # the call to the last row the caller passed in.
+                        df.at[df.index[-1], 'REFERENCES'].append(target)
+
+        if not rows:
+            return df
+        return pd.concat([df, pd.DataFrame(rows)], ignore_index=True)
+
+    def process_function(self, start_address, df, md, seen_functions, call_stack):
+        """Disassemble a single function and append it to ``df``.
+
+        Disassembly runs from ``start_address`` up to the first return
+        instruction. If no return is reached, no row is added.
+
+        Args:
+            start_address: Address of the function's first instruction.
+            df: DataFrame with the columns FUNC_ADDRESS, LOCATION, REFERENCES.
+            md: Configured Capstone disassembler (operand detail enabled).
+            seen_functions: Set of addresses already handled; updated in place.
+            call_stack: List that receives ``bl`` targets not seen so far.
+
+        Returns:
+            The DataFrame with the new row appended, or ``df`` unchanged when
+            the address was already seen, lies outside the image, or no
+            return instruction was found.
+        """
+        if start_address in seen_functions:
+            # Always hand a DataFrame back so `df = process_function(...)`
+            # cannot overwrite the caller's frame with None.
+            return df
+        seen_functions.add(start_address)
+
+        # Translate the address into an offset inside the image buffer.
+        offset = start_address - self.start_address
+        if not 0 <= offset < len(self.binary_bytes):
+            return df
+
+        func_name = f"func_{start_address:x}"
+        references = []
+
+        for instruction in md.disasm(self.binary_bytes[offset:], start_address):
+            if instruction.mnemonic in self._RETURN_MNEMONICS:
+                new_row = pd.DataFrame([{
+                    'FUNC_ADDRESS': func_name,
+                    'LOCATION': start_address,
+                    'REFERENCES': references
+                }])
+                return pd.concat([df, new_row], ignore_index=True)
+
+            if instruction.mnemonic == 'bl':
+                operands = instruction.operands
+                if operands and operands[0].type == ARM64_OP_IMM:
+                    target_address = operands[0].imm
+                    references.append(f"func_{target_address:x}")
+                    if target_address not in seen_functions:
+                        call_stack.append(target_address)
+
+        return df
+
+class FunctionExtractor:
+    """Load a binary image and extract its functions into a DataFrame.
+
+    Extraction runs during construction. Afterwards ``dataframe`` holds the
+    FUNC_ADDRESS / LOCATION / REFERENCES rows produced by the AArch64 parser,
+    ``functions`` holds whatever the AArch32 parser returned, and ``parser``
+    is the parser instance that was used (``None`` for an unsupported
+    architecture).
+
+    Args:
+        binary: Path of the binary file to read.
+        architecture: 'ARM_AARCH64' or 'ARM_AARCH32' (case-insensitive).
+        endian: 'little' or 'big'. Defaults to 'little' so callers that only
+            pass a path and an architecture keep working.
+    """
+
+    def __init__(self, binary, architecture, endian='little'):
+        self.binary = binary
+        self.architecture = architecture.upper()
+        self.endian = endian
+        # Create every result attribute up front so each one exists no matter
+        # which branch extract_functions() takes.
+        self.parser = None
+        self.functions = None
+        self.dataframe = pd.DataFrame(columns=['FUNC_ADDRESS', 'LOCATION', 'REFERENCES'])
+        self.extract_functions()
+
+    def binary_bytes(self):
+        """Return the full contents of the binary file as bytes."""
+        with open(self.binary, 'rb') as f:
+            return f.read()
+
+    def extract_functions(self):
+        """Read the binary and run the parser matching ``self.architecture``.
+
+        An unsupported architecture is logged as an error and leaves
+        ``dataframe`` untouched.
+        """
+        binary_bytes = self.binary_bytes()
+
+        if self.architecture == 'ARM_AARCH32':
+            self.parser = ARM_AARCH32_Parser(binary_bytes)
+            self.functions = self.parser.extract_functions()
+        elif self.architecture == 'ARM_AARCH64':
+            self.parser = ARM_AARCH64_Parser(binary_bytes)
+            self.dataframe = self.parser.extract_functions(self.dataframe, self.endian)
+        else:
+            logger.error('Architecture not supported: ' + self.architecture)
+            return
+
+if __name__ == "__main__":
+    # Command-line entry point: parse the arguments, run the extraction and
+    # write the resulting table as CSV.
+    parser = argparse.ArgumentParser(description='Extract functions from firmware binaries.')
+    parser.add_argument('--binary', help='The binary file to extract functions from.', required=True, type=str)
+    parser.add_argument('--output', help='The output file to write the functions to.', required=False, type=str)
+    parser.add_argument(
+        '--architecture',
+        help='The architecture of the binary. Use: ARM_AARCH64, ARM_AARCH32, etc.',
+        required=True,
+        type=str,
+        choices=['ARM_AARCH64', 'ARM_AARCH32']
+    )
+    parser.add_argument(
+        '--endian',
+        help='The endianness of the binary. Use: little, big.',
+        required=True,
+        type=str,
+        choices=['little', 'big']
+    )
+    args = parser.parse_args()
+
+    # Pass one complete message. NOTE(review): assuming herrewebpy.logger follows
+    # the stdlib logging convention, a second positional string would be taken
+    # as a %-format argument instead of being appended to the message.
+    logger.info(f'Extracting functions from binary: {args.binary} using architecture: {args.architecture.upper()}')
+
+    extractor = FunctionExtractor(args.binary, args.architecture, args.endian)
+
+    # Default output path: the input path with a '.functions' suffix.
+    output_path = args.output or args.binary + '.functions'
+
+    # Hand pandas the path so it opens and closes the file itself.
+    extractor.dataframe.to_csv(output_path, index=False)
--- a/requirements.txt
+++ b/requirements.txt
@ -1,6 +1,10 @@
 pandas
 numpy
 tensorflow
-sklearn
+scikit-learn
 seaborn
-scikit-learn
+scikit-learn
+capstone
+keystone-engine
+plotly
+BioPython
--- a/sample_data/firmwares/S7_BL31.bin
+++ b/sample_data/firmwares/S7_BL31.bin