Shofel2_T124_python/venv/lib/python3.10/site-packages/pefile.py

7854 lines
285 KiB
Python

#!/usr/bin/python
"""pefile, Portable Executable reader module
All the PE file basic structures are available with their default names as
attributes of the instance returned.
Processed elements such as the import table are made available with lowercase
names, to differentiate them from the upper case basic structure names.
pefile has been tested against many edge cases such as corrupted and malformed
PEs as well as malware, which often attempts to abuse the format way beyond its
standard use. To the best of my knowledge most of the abuse is handled
gracefully.
Copyright (c) 2005-2023 Ero Carrera <ero.carrera@gmail.com>
"""
__author__ = "Ero Carrera"
__version__ = "2023.2.7"
__contact__ = "ero.carrera@gmail.com"
import collections
import os
import struct
import codecs
import time
import math
import string
import mmap
import uuid
from collections import Counter
from typing import Union
from hashlib import sha1
from hashlib import sha256
from hashlib import sha512
from hashlib import md5
import functools
import copy as copymod
import ordlookup
# Register "backslashreplace_" as a second name for the built-in
# "backslashreplace" codec error handler; UnicodeStringWrapperPostProcessor
# decodes with this handler name in its __str__ method.
codecs.register_error("backslashreplace_", codecs.lookup_error("backslashreplace"))
# Alias used by the isinstance(val, (int, long)) checks further down the file.
long = int
# functools.lru_cache variant that can return a shallow copy of the cached
# object (list, dict, ...) on every call.  copy.deepcopy is not used because it
# is _really_ slow and a shallow copy is enough for the data cached here.
# Idea taken from https://stackoverflow.com/questions/54909357
def lru_cache(maxsize=128, typed=False, copy=False):
    """Return an LRU-caching decorator, optionally copying cached results.

    Args:
        maxsize: forwarded to functools.lru_cache.
        typed: forwarded to functools.lru_cache.
        copy: when false, the plain functools.lru_cache decorator is returned.
            When true, every call returns copy.copy() of the cached value, so
            callers do not share the top-level object.  The copy is shallow:
            objects nested inside the result are still shared.

    Returns:
        A decorator to apply to the function that should be cached.
    """
    if copy:

        def decorator(f):
            memoized = functools.lru_cache(maxsize, typed)(f)

            @functools.wraps(f)
            def wrapper(*args, **kwargs):
                cached_value = memoized(*args, **kwargs)
                return copymod.copy(cached_value)

            return wrapper

        return decorator

    return functools.lru_cache(maxsize, typed)
@lru_cache(maxsize=2048)
def cache_adjust_FileAlignment(val, file_alignment):
    """Round `val` down to a multiple of 0x200 when the alignment requires it.

    Args:
        val: the integer value to adjust (expected to be non-negative).
        file_alignment: the file alignment to test against
            FILE_ALIGNMENT_HARDCODED_VALUE.

    Returns:
        `val` unchanged when `file_alignment` is smaller than
        FILE_ALIGNMENT_HARDCODED_VALUE, otherwise `val` rounded down to the
        nearest multiple of 0x200.
    """
    if file_alignment < FILE_ALIGNMENT_HARDCODED_VALUE:
        return val
    # Integer floor division keeps the result exact; going through float
    # division (val / 0x200) silently loses precision for very large values.
    return (val // 0x200) * 0x200
@lru_cache(maxsize=2048)
def cache_adjust_SectionAlignment(val, section_alignment, file_alignment):
    """Round `val` down to the effective section alignment.

    Args:
        val: the integer value to adjust (expected to be non-negative).
        section_alignment: the section alignment; values below 0x1000 (the
            page size) are replaced by `file_alignment`.
        file_alignment: fallback alignment used for small section alignments.

    Returns:
        `val` rounded down to a multiple of the effective alignment, or `val`
        unchanged when it is already aligned or the effective alignment is 0.
    """
    if section_alignment < 0x1000:  # page size
        section_alignment = file_alignment

    # 0x200 is the minimum valid FileAlignment according to the documentation
    # although ntoskrnl.exe has an alignment of 0x80 in some Windows versions,
    # so no lower bound is enforced here.

    if section_alignment and val % section_alignment:
        # Integer floor division keeps the result exact; float division
        # (val / section_alignment) loses precision for very large values.
        return section_alignment * (val // section_alignment)
    return val
def count_zeroes(data):
    """Return how many items of `data` equal 0 (the NUL bytes of a bytes object)."""
    zero_total = data.count(0)
    return zero_total
# Module-level flag, False by default.
# NOTE(review): presumably the default for skipping directory parsing when a
# PE is loaded; its consumer is not visible in this part of the file - confirm.
fast_load = False
# This will set a maximum length of a string to be retrieved from the file.
# It's there to prevent loading massive amounts of data from memory mapped
# files. Strings longer than 1MB should be rather rare.
MAX_STRING_LENGTH = 0x100000  # 2^20
# Maximum number of imports to parse.
MAX_IMPORT_SYMBOLS = 0x2000
# Limit maximum length for specific string types separately
MAX_IMPORT_NAME_LENGTH = 0x200
MAX_DLL_LENGTH = 0x200
MAX_SYMBOL_NAME_LENGTH = 0x200
# Limit maximum number of sections before processing of sections will stop
MAX_SECTIONS = 0x800
# The global maximum number of resource entries to parse per file
MAX_RESOURCE_ENTRIES = 0x8000
# The maximum depth of nested resource tables
MAX_RESOURCE_DEPTH = 32
# Limit number of exported symbols
MAX_SYMBOL_EXPORT_COUNT = 0x2000
# Signature values; the trailing comments show the bytes each value spells
# when it is packed little-endian.
IMAGE_DOS_SIGNATURE = 0x5A4D  # "MZ"
IMAGE_DOSZM_SIGNATURE = 0x4D5A  # "ZM"
IMAGE_NE_SIGNATURE = 0x454E  # "NE"
IMAGE_LE_SIGNATURE = 0x454C  # "LE"
IMAGE_LX_SIGNATURE = 0x584C  # "LX"
IMAGE_TE_SIGNATURE = 0x5A56  # Terse Executables have a 'VZ' signature
IMAGE_NT_SIGNATURE = 0x00004550  # "PE\0\0"
IMAGE_NUMBEROF_DIRECTORY_ENTRIES = 16
IMAGE_ORDINAL_FLAG = 0x80000000  # bit 31
IMAGE_ORDINAL_FLAG64 = 0x8000000000000000  # bit 63
OPTIONAL_HEADER_MAGIC_PE = 0x10B
OPTIONAL_HEADER_MAGIC_PE_PLUS = 0x20B
def two_way_dict(pairs):
    """Build a dictionary that maps both ways for a list of 2-item pairs.

    For every (first, second) pair the result maps first -> second and
    second -> first.  The reversed entries are inserted first, so when
    several pairs share the same second item, the reverse lookup yields the
    first item of the last such pair.
    """
    reversed_pairs = [(second, first) for first, second in pairs]
    return dict(reversed_pairs + pairs)
# (name, index) pairs for the IMAGE_DIRECTORY_ENTRY_* constants.
directory_entry_types = [
    ("IMAGE_DIRECTORY_ENTRY_EXPORT", 0),
    ("IMAGE_DIRECTORY_ENTRY_IMPORT", 1),
    ("IMAGE_DIRECTORY_ENTRY_RESOURCE", 2),
    ("IMAGE_DIRECTORY_ENTRY_EXCEPTION", 3),
    ("IMAGE_DIRECTORY_ENTRY_SECURITY", 4),
    ("IMAGE_DIRECTORY_ENTRY_BASERELOC", 5),
    ("IMAGE_DIRECTORY_ENTRY_DEBUG", 6),
    # Architecture on non-x86 platforms
    ("IMAGE_DIRECTORY_ENTRY_COPYRIGHT", 7),
    ("IMAGE_DIRECTORY_ENTRY_GLOBALPTR", 8),
    ("IMAGE_DIRECTORY_ENTRY_TLS", 9),
    ("IMAGE_DIRECTORY_ENTRY_LOAD_CONFIG", 10),
    ("IMAGE_DIRECTORY_ENTRY_BOUND_IMPORT", 11),
    ("IMAGE_DIRECTORY_ENTRY_IAT", 12),
    ("IMAGE_DIRECTORY_ENTRY_DELAY_IMPORT", 13),
    ("IMAGE_DIRECTORY_ENTRY_COM_DESCRIPTOR", 14),
    ("IMAGE_DIRECTORY_ENTRY_RESERVED", 15),
]
# Lookup in both directions: name -> index and index -> name.
DIRECTORY_ENTRY = two_way_dict(directory_entry_types)
# (name, bit mask) pairs for the IMAGE_FILE_* characteristics flags.
image_characteristics = [
    ("IMAGE_FILE_RELOCS_STRIPPED", 0x0001),
    ("IMAGE_FILE_EXECUTABLE_IMAGE", 0x0002),
    ("IMAGE_FILE_LINE_NUMS_STRIPPED", 0x0004),
    ("IMAGE_FILE_LOCAL_SYMS_STRIPPED", 0x0008),
    ("IMAGE_FILE_AGGRESIVE_WS_TRIM", 0x0010),
    ("IMAGE_FILE_LARGE_ADDRESS_AWARE", 0x0020),
    ("IMAGE_FILE_16BIT_MACHINE", 0x0040),
    ("IMAGE_FILE_BYTES_REVERSED_LO", 0x0080),
    ("IMAGE_FILE_32BIT_MACHINE", 0x0100),
    ("IMAGE_FILE_DEBUG_STRIPPED", 0x0200),
    ("IMAGE_FILE_REMOVABLE_RUN_FROM_SWAP", 0x0400),
    ("IMAGE_FILE_NET_RUN_FROM_SWAP", 0x0800),
    ("IMAGE_FILE_SYSTEM", 0x1000),
    ("IMAGE_FILE_DLL", 0x2000),
    ("IMAGE_FILE_UP_SYSTEM_ONLY", 0x4000),
    ("IMAGE_FILE_BYTES_REVERSED_HI", 0x8000),
]
# Lookup in both directions: name -> mask and mask -> name.
IMAGE_CHARACTERISTICS = two_way_dict(image_characteristics)
# (name, value) pairs for the IMAGE_SCN_* section characteristics.
# Several names share a value (0x00004000, 0x00008000, 0x00020000); for those
# the value -> name lookup in SECTION_CHARACTERISTICS yields the last name
# listed here.
section_characteristics = [
    ("IMAGE_SCN_TYPE_REG", 0x00000000),  # reserved
    ("IMAGE_SCN_TYPE_DSECT", 0x00000001),  # reserved
    ("IMAGE_SCN_TYPE_NOLOAD", 0x00000002),  # reserved
    ("IMAGE_SCN_TYPE_GROUP", 0x00000004),  # reserved
    ("IMAGE_SCN_TYPE_NO_PAD", 0x00000008),  # reserved
    ("IMAGE_SCN_TYPE_COPY", 0x00000010),  # reserved
    ("IMAGE_SCN_CNT_CODE", 0x00000020),
    ("IMAGE_SCN_CNT_INITIALIZED_DATA", 0x00000040),
    ("IMAGE_SCN_CNT_UNINITIALIZED_DATA", 0x00000080),
    ("IMAGE_SCN_LNK_OTHER", 0x00000100),
    ("IMAGE_SCN_LNK_INFO", 0x00000200),
    ("IMAGE_SCN_LNK_OVER", 0x00000400),  # reserved
    ("IMAGE_SCN_LNK_REMOVE", 0x00000800),
    ("IMAGE_SCN_LNK_COMDAT", 0x00001000),
    ("IMAGE_SCN_MEM_PROTECTED", 0x00004000),  # obsolete
    ("IMAGE_SCN_NO_DEFER_SPEC_EXC", 0x00004000),
    ("IMAGE_SCN_GPREL", 0x00008000),
    ("IMAGE_SCN_MEM_FARDATA", 0x00008000),
    ("IMAGE_SCN_MEM_SYSHEAP", 0x00010000),  # obsolete
    ("IMAGE_SCN_MEM_PURGEABLE", 0x00020000),
    ("IMAGE_SCN_MEM_16BIT", 0x00020000),
    ("IMAGE_SCN_MEM_LOCKED", 0x00040000),
    ("IMAGE_SCN_MEM_PRELOAD", 0x00080000),
    ("IMAGE_SCN_ALIGN_1BYTES", 0x00100000),
    ("IMAGE_SCN_ALIGN_2BYTES", 0x00200000),
    ("IMAGE_SCN_ALIGN_4BYTES", 0x00300000),
    ("IMAGE_SCN_ALIGN_8BYTES", 0x00400000),
    ("IMAGE_SCN_ALIGN_16BYTES", 0x00500000),  # default alignment
    ("IMAGE_SCN_ALIGN_32BYTES", 0x00600000),
    ("IMAGE_SCN_ALIGN_64BYTES", 0x00700000),
    ("IMAGE_SCN_ALIGN_128BYTES", 0x00800000),
    ("IMAGE_SCN_ALIGN_256BYTES", 0x00900000),
    ("IMAGE_SCN_ALIGN_512BYTES", 0x00A00000),
    ("IMAGE_SCN_ALIGN_1024BYTES", 0x00B00000),
    ("IMAGE_SCN_ALIGN_2048BYTES", 0x00C00000),
    ("IMAGE_SCN_ALIGN_4096BYTES", 0x00D00000),
    ("IMAGE_SCN_ALIGN_8192BYTES", 0x00E00000),
    ("IMAGE_SCN_ALIGN_MASK", 0x00F00000),
    ("IMAGE_SCN_LNK_NRELOC_OVFL", 0x01000000),
    ("IMAGE_SCN_MEM_DISCARDABLE", 0x02000000),
    ("IMAGE_SCN_MEM_NOT_CACHED", 0x04000000),
    ("IMAGE_SCN_MEM_NOT_PAGED", 0x08000000),
    ("IMAGE_SCN_MEM_SHARED", 0x10000000),
    ("IMAGE_SCN_MEM_EXECUTE", 0x20000000),
    ("IMAGE_SCN_MEM_READ", 0x40000000),
    ("IMAGE_SCN_MEM_WRITE", 0x80000000),
]
# Lookup in both directions: name -> value and value -> name.
SECTION_CHARACTERISTICS = two_way_dict(section_characteristics)
# (name, value) pairs for the IMAGE_DEBUG_TYPE_* constants.
debug_types = [
    ("IMAGE_DEBUG_TYPE_UNKNOWN", 0),
    ("IMAGE_DEBUG_TYPE_COFF", 1),
    ("IMAGE_DEBUG_TYPE_CODEVIEW", 2),
    ("IMAGE_DEBUG_TYPE_FPO", 3),
    ("IMAGE_DEBUG_TYPE_MISC", 4),
    ("IMAGE_DEBUG_TYPE_EXCEPTION", 5),
    ("IMAGE_DEBUG_TYPE_FIXUP", 6),
    ("IMAGE_DEBUG_TYPE_OMAP_TO_SRC", 7),
    ("IMAGE_DEBUG_TYPE_OMAP_FROM_SRC", 8),
    ("IMAGE_DEBUG_TYPE_BORLAND", 9),
    ("IMAGE_DEBUG_TYPE_RESERVED10", 10),
    ("IMAGE_DEBUG_TYPE_CLSID", 11),
    ("IMAGE_DEBUG_TYPE_VC_FEATURE", 12),
    ("IMAGE_DEBUG_TYPE_POGO", 13),
    ("IMAGE_DEBUG_TYPE_ILTCG", 14),
    ("IMAGE_DEBUG_TYPE_MPX", 15),
    ("IMAGE_DEBUG_TYPE_REPRO", 16),
    ("IMAGE_DEBUG_TYPE_EX_DLLCHARACTERISTICS", 20),
]
# Lookup in both directions: name -> value and value -> name.
DEBUG_TYPE = two_way_dict(debug_types)
# (name, value) pairs for the IMAGE_SUBSYSTEM_* constants.
subsystem_types = [
    ("IMAGE_SUBSYSTEM_UNKNOWN", 0),
    ("IMAGE_SUBSYSTEM_NATIVE", 1),
    ("IMAGE_SUBSYSTEM_WINDOWS_GUI", 2),
    ("IMAGE_SUBSYSTEM_WINDOWS_CUI", 3),
    ("IMAGE_SUBSYSTEM_OS2_CUI", 5),
    ("IMAGE_SUBSYSTEM_POSIX_CUI", 7),
    ("IMAGE_SUBSYSTEM_NATIVE_WINDOWS", 8),
    ("IMAGE_SUBSYSTEM_WINDOWS_CE_GUI", 9),
    ("IMAGE_SUBSYSTEM_EFI_APPLICATION", 10),
    ("IMAGE_SUBSYSTEM_EFI_BOOT_SERVICE_DRIVER", 11),
    ("IMAGE_SUBSYSTEM_EFI_RUNTIME_DRIVER", 12),
    ("IMAGE_SUBSYSTEM_EFI_ROM", 13),
    ("IMAGE_SUBSYSTEM_XBOX", 14),
    ("IMAGE_SUBSYSTEM_WINDOWS_BOOT_APPLICATION", 16),
]
# Lookup in both directions: name -> value and value -> name.
SUBSYSTEM_TYPE = two_way_dict(subsystem_types)
# (name, value) pairs for the IMAGE_FILE_MACHINE_* constants.
# 0x0284 is listed under two names; MACHINE_TYPE[0x0284] yields the last one.
machine_types = [
    ("IMAGE_FILE_MACHINE_UNKNOWN", 0x0),
    ("IMAGE_FILE_MACHINE_I386", 0x014C),
    ("IMAGE_FILE_MACHINE_R3000", 0x0162),
    ("IMAGE_FILE_MACHINE_R4000", 0x0166),
    ("IMAGE_FILE_MACHINE_R10000", 0x0168),
    ("IMAGE_FILE_MACHINE_WCEMIPSV2", 0x0169),
    ("IMAGE_FILE_MACHINE_ALPHA", 0x0184),
    ("IMAGE_FILE_MACHINE_SH3", 0x01A2),
    ("IMAGE_FILE_MACHINE_SH3DSP", 0x01A3),
    ("IMAGE_FILE_MACHINE_SH3E", 0x01A4),
    ("IMAGE_FILE_MACHINE_SH4", 0x01A6),
    ("IMAGE_FILE_MACHINE_SH5", 0x01A8),
    ("IMAGE_FILE_MACHINE_ARM", 0x01C0),
    ("IMAGE_FILE_MACHINE_THUMB", 0x01C2),
    ("IMAGE_FILE_MACHINE_ARMNT", 0x01C4),
    ("IMAGE_FILE_MACHINE_AM33", 0x01D3),
    ("IMAGE_FILE_MACHINE_POWERPC", 0x01F0),
    ("IMAGE_FILE_MACHINE_POWERPCFP", 0x01F1),
    ("IMAGE_FILE_MACHINE_IA64", 0x0200),
    ("IMAGE_FILE_MACHINE_MIPS16", 0x0266),
    ("IMAGE_FILE_MACHINE_ALPHA64", 0x0284),
    ("IMAGE_FILE_MACHINE_AXP64", 0x0284),  # same
    ("IMAGE_FILE_MACHINE_MIPSFPU", 0x0366),
    ("IMAGE_FILE_MACHINE_MIPSFPU16", 0x0466),
    ("IMAGE_FILE_MACHINE_TRICORE", 0x0520),
    ("IMAGE_FILE_MACHINE_CEF", 0x0CEF),
    ("IMAGE_FILE_MACHINE_EBC", 0x0EBC),
    ("IMAGE_FILE_MACHINE_RISCV32", 0x5032),
    ("IMAGE_FILE_MACHINE_RISCV64", 0x5064),
    ("IMAGE_FILE_MACHINE_RISCV128", 0x5128),
    ("IMAGE_FILE_MACHINE_LOONGARCH32", 0x6232),
    ("IMAGE_FILE_MACHINE_LOONGARCH64", 0x6264),
    ("IMAGE_FILE_MACHINE_AMD64", 0x8664),
    ("IMAGE_FILE_MACHINE_M32R", 0x9041),
    ("IMAGE_FILE_MACHINE_ARM64", 0xAA64),
    ("IMAGE_FILE_MACHINE_CEE", 0xC0EE),
]
# Lookup in both directions: name -> value and value -> name.
MACHINE_TYPE = two_way_dict(machine_types)
# (name, value) pairs for the IMAGE_REL_BASED_* relocation types.
# Value 9 is listed under two names; RELOCATION_TYPE[9] yields the last one.
relocation_types = [
    ("IMAGE_REL_BASED_ABSOLUTE", 0),
    ("IMAGE_REL_BASED_HIGH", 1),
    ("IMAGE_REL_BASED_LOW", 2),
    ("IMAGE_REL_BASED_HIGHLOW", 3),
    ("IMAGE_REL_BASED_HIGHADJ", 4),
    ("IMAGE_REL_BASED_MIPS_JMPADDR", 5),
    ("IMAGE_REL_BASED_SECTION", 6),
    ("IMAGE_REL_BASED_REL", 7),
    ("IMAGE_REL_BASED_MIPS_JMPADDR16", 9),
    ("IMAGE_REL_BASED_IA64_IMM64", 9),
    ("IMAGE_REL_BASED_DIR64", 10),
    ("IMAGE_REL_BASED_HIGH3ADJ", 11),
]
# Lookup in both directions: name -> value and value -> name.
RELOCATION_TYPE = two_way_dict(relocation_types)
# (name, bit mask) pairs for the DLL characteristics flags.
dll_characteristics = [
    ("IMAGE_LIBRARY_PROCESS_INIT", 0x0001),  # reserved
    ("IMAGE_LIBRARY_PROCESS_TERM", 0x0002),  # reserved
    ("IMAGE_LIBRARY_THREAD_INIT", 0x0004),  # reserved
    ("IMAGE_LIBRARY_THREAD_TERM", 0x0008),  # reserved
    ("IMAGE_DLLCHARACTERISTICS_HIGH_ENTROPY_VA", 0x0020),
    ("IMAGE_DLLCHARACTERISTICS_DYNAMIC_BASE", 0x0040),
    ("IMAGE_DLLCHARACTERISTICS_FORCE_INTEGRITY", 0x0080),
    ("IMAGE_DLLCHARACTERISTICS_NX_COMPAT", 0x0100),
    ("IMAGE_DLLCHARACTERISTICS_NO_ISOLATION", 0x0200),
    ("IMAGE_DLLCHARACTERISTICS_NO_SEH", 0x0400),
    ("IMAGE_DLLCHARACTERISTICS_NO_BIND", 0x0800),
    ("IMAGE_DLLCHARACTERISTICS_APPCONTAINER", 0x1000),
    ("IMAGE_DLLCHARACTERISTICS_WDM_DRIVER", 0x2000),
    ("IMAGE_DLLCHARACTERISTICS_GUARD_CF", 0x4000),
    ("IMAGE_DLLCHARACTERISTICS_TERMINAL_SERVER_AWARE", 0x8000),
]
# Lookup in both directions: name -> mask and mask -> name.
DLL_CHARACTERISTICS = two_way_dict(dll_characteristics)
# Threshold used by cache_adjust_FileAlignment(): values are only rounded down
# to a multiple of 0x200 when the file alignment is at least this large.
FILE_ALIGNMENT_HARDCODED_VALUE = 0x200
# Unwind info-related enums
# (name, bit mask) pairs for the UNW_FLAG_* flags.
unwind_info_flags = [
    ("UNW_FLAG_EHANDLER", 0x01),
    ("UNW_FLAG_UHANDLER", 0x02),
    ("UNW_FLAG_CHAININFO", 0x04),
]
# Lookup in both directions: name -> mask and mask -> name.
UNWIND_INFO_FLAGS = two_way_dict(unwind_info_flags)
# (register name, register number) pairs.
registers = [
    ("RAX", 0),
    ("RCX", 1),
    ("RDX", 2),
    ("RBX", 3),
    ("RSP", 4),
    ("RBP", 5),
    ("RSI", 6),
    ("RDI", 7),
    ("R8", 8),
    ("R9", 9),
    ("R10", 10),
    ("R11", 11),
    ("R12", 12),
    ("R13", 13),
    ("R14", 14),
    ("R15", 15),
]
# Lookup in both directions: name -> number and number -> name.
REGISTERS = two_way_dict(registers)
# enum _UNWIND_OP_CODES
# Note that the value 7 is not assigned to any name below.
UWOP_PUSH_NONVOL = 0
UWOP_ALLOC_LARGE = 1
UWOP_ALLOC_SMALL = 2
UWOP_SET_FPREG = 3
UWOP_SAVE_NONVOL = 4
UWOP_SAVE_NONVOL_FAR = 5
UWOP_EPILOG = 6
UWOP_SAVE_XMM128 = 8
UWOP_SAVE_XMM128_FAR = 9
UWOP_PUSH_MACHFRAME = 10
# Resource types
# (name, id) pairs for the RT_* resource type constants.
resource_type = [
    ("RT_CURSOR", 1),
    ("RT_BITMAP", 2),
    ("RT_ICON", 3),
    ("RT_MENU", 4),
    ("RT_DIALOG", 5),
    ("RT_STRING", 6),
    ("RT_FONTDIR", 7),
    ("RT_FONT", 8),
    ("RT_ACCELERATOR", 9),
    ("RT_RCDATA", 10),
    ("RT_MESSAGETABLE", 11),
    ("RT_GROUP_CURSOR", 12),
    ("RT_GROUP_ICON", 14),
    ("RT_VERSION", 16),
    ("RT_DLGINCLUDE", 17),
    ("RT_PLUGPLAY", 19),
    ("RT_VXD", 20),
    ("RT_ANICURSOR", 21),
    ("RT_ANIICON", 22),
    ("RT_HTML", 23),
    ("RT_MANIFEST", 24),
]
# Lookup in both directions: name -> id and id -> name.
RESOURCE_TYPE = two_way_dict(resource_type)
# Language definitions
# (name, value) pairs for the LANG_* primary language identifiers.
# 0x1A is listed under two names (LANG_CROATIAN, LANG_SERBIAN); LANG[0x1A]
# yields the last one listed.
lang = [
    ("LANG_NEUTRAL", 0x00),
    ("LANG_INVARIANT", 0x7F),
    ("LANG_AFRIKAANS", 0x36),
    ("LANG_ALBANIAN", 0x1C),
    ("LANG_ARABIC", 0x01),
    ("LANG_ARMENIAN", 0x2B),
    ("LANG_ASSAMESE", 0x4D),
    ("LANG_AZERI", 0x2C),
    ("LANG_BASQUE", 0x2D),
    ("LANG_BELARUSIAN", 0x23),
    ("LANG_BENGALI", 0x45),
    ("LANG_BULGARIAN", 0x02),
    ("LANG_CATALAN", 0x03),
    ("LANG_CHINESE", 0x04),
    ("LANG_CROATIAN", 0x1A),
    ("LANG_CZECH", 0x05),
    ("LANG_DANISH", 0x06),
    ("LANG_DIVEHI", 0x65),
    ("LANG_DUTCH", 0x13),
    ("LANG_ENGLISH", 0x09),
    ("LANG_ESTONIAN", 0x25),
    ("LANG_FAEROESE", 0x38),
    ("LANG_FARSI", 0x29),
    ("LANG_FINNISH", 0x0B),
    ("LANG_FRENCH", 0x0C),
    ("LANG_GALICIAN", 0x56),
    ("LANG_GEORGIAN", 0x37),
    ("LANG_GERMAN", 0x07),
    ("LANG_GREEK", 0x08),
    ("LANG_GUJARATI", 0x47),
    ("LANG_HEBREW", 0x0D),
    ("LANG_HINDI", 0x39),
    ("LANG_HUNGARIAN", 0x0E),
    ("LANG_ICELANDIC", 0x0F),
    ("LANG_INDONESIAN", 0x21),
    ("LANG_ITALIAN", 0x10),
    ("LANG_JAPANESE", 0x11),
    ("LANG_KANNADA", 0x4B),
    ("LANG_KASHMIRI", 0x60),
    ("LANG_KAZAK", 0x3F),
    ("LANG_KONKANI", 0x57),
    ("LANG_KOREAN", 0x12),
    ("LANG_KYRGYZ", 0x40),
    ("LANG_LATVIAN", 0x26),
    ("LANG_LITHUANIAN", 0x27),
    ("LANG_MACEDONIAN", 0x2F),
    ("LANG_MALAY", 0x3E),
    ("LANG_MALAYALAM", 0x4C),
    ("LANG_MANIPURI", 0x58),
    ("LANG_MARATHI", 0x4E),
    ("LANG_MONGOLIAN", 0x50),
    ("LANG_NEPALI", 0x61),
    ("LANG_NORWEGIAN", 0x14),
    ("LANG_ORIYA", 0x48),
    ("LANG_POLISH", 0x15),
    ("LANG_PORTUGUESE", 0x16),
    ("LANG_PUNJABI", 0x46),
    ("LANG_ROMANIAN", 0x18),
    ("LANG_RUSSIAN", 0x19),
    ("LANG_SANSKRIT", 0x4F),
    ("LANG_SERBIAN", 0x1A),
    ("LANG_SINDHI", 0x59),
    ("LANG_SLOVAK", 0x1B),
    ("LANG_SLOVENIAN", 0x24),
    ("LANG_SPANISH", 0x0A),
    ("LANG_SWAHILI", 0x41),
    ("LANG_SWEDISH", 0x1D),
    ("LANG_SYRIAC", 0x5A),
    ("LANG_TAMIL", 0x49),
    ("LANG_TATAR", 0x44),
    ("LANG_TELUGU", 0x4A),
    ("LANG_THAI", 0x1E),
    ("LANG_TURKISH", 0x1F),
    ("LANG_UKRAINIAN", 0x22),
    ("LANG_URDU", 0x20),
    ("LANG_UZBEK", 0x43),
    ("LANG_VIETNAMESE", 0x2A),
    ("LANG_GAELIC", 0x3C),
    ("LANG_MALTESE", 0x3A),
    ("LANG_MAORI", 0x28),
    ("LANG_RHAETO_ROMANCE", 0x17),
    ("LANG_SAAMI", 0x3B),
    ("LANG_SORBIAN", 0x2E),
    ("LANG_SUTU", 0x30),
    ("LANG_TSONGA", 0x31),
    ("LANG_TSWANA", 0x32),
    ("LANG_VENDA", 0x33),
    ("LANG_XHOSA", 0x34),
    ("LANG_ZULU", 0x35),
    ("LANG_ESPERANTO", 0x8F),
    ("LANG_WALON", 0x90),
    ("LANG_CORNISH", 0x91),
    ("LANG_WELSH", 0x92),
    ("LANG_BRETON", 0x93),
]
# Lookup in both directions: name -> value and value -> name.
LANG = two_way_dict(lang)
# Sublanguage definitions
# (name, value) pairs for the SUBLANG_* identifiers.  The values are small
# integers reused across languages, so most values appear under many names.
sublang = [
    ("SUBLANG_NEUTRAL", 0x00),
    ("SUBLANG_DEFAULT", 0x01),
    ("SUBLANG_SYS_DEFAULT", 0x02),
    ("SUBLANG_ARABIC_SAUDI_ARABIA", 0x01),
    ("SUBLANG_ARABIC_IRAQ", 0x02),
    ("SUBLANG_ARABIC_EGYPT", 0x03),
    ("SUBLANG_ARABIC_LIBYA", 0x04),
    ("SUBLANG_ARABIC_ALGERIA", 0x05),
    ("SUBLANG_ARABIC_MOROCCO", 0x06),
    ("SUBLANG_ARABIC_TUNISIA", 0x07),
    ("SUBLANG_ARABIC_OMAN", 0x08),
    ("SUBLANG_ARABIC_YEMEN", 0x09),
    ("SUBLANG_ARABIC_SYRIA", 0x0A),
    ("SUBLANG_ARABIC_JORDAN", 0x0B),
    ("SUBLANG_ARABIC_LEBANON", 0x0C),
    ("SUBLANG_ARABIC_KUWAIT", 0x0D),
    ("SUBLANG_ARABIC_UAE", 0x0E),
    ("SUBLANG_ARABIC_BAHRAIN", 0x0F),
    ("SUBLANG_ARABIC_QATAR", 0x10),
    ("SUBLANG_AZERI_LATIN", 0x01),
    ("SUBLANG_AZERI_CYRILLIC", 0x02),
    ("SUBLANG_CHINESE_TRADITIONAL", 0x01),
    ("SUBLANG_CHINESE_SIMPLIFIED", 0x02),
    ("SUBLANG_CHINESE_HONGKONG", 0x03),
    ("SUBLANG_CHINESE_SINGAPORE", 0x04),
    ("SUBLANG_CHINESE_MACAU", 0x05),
    ("SUBLANG_DUTCH", 0x01),
    ("SUBLANG_DUTCH_BELGIAN", 0x02),
    ("SUBLANG_ENGLISH_US", 0x01),
    ("SUBLANG_ENGLISH_UK", 0x02),
    ("SUBLANG_ENGLISH_AUS", 0x03),
    ("SUBLANG_ENGLISH_CAN", 0x04),
    ("SUBLANG_ENGLISH_NZ", 0x05),
    ("SUBLANG_ENGLISH_EIRE", 0x06),
    ("SUBLANG_ENGLISH_SOUTH_AFRICA", 0x07),
    ("SUBLANG_ENGLISH_JAMAICA", 0x08),
    ("SUBLANG_ENGLISH_CARIBBEAN", 0x09),
    ("SUBLANG_ENGLISH_BELIZE", 0x0A),
    ("SUBLANG_ENGLISH_TRINIDAD", 0x0B),
    ("SUBLANG_ENGLISH_ZIMBABWE", 0x0C),
    ("SUBLANG_ENGLISH_PHILIPPINES", 0x0D),
    ("SUBLANG_FRENCH", 0x01),
    ("SUBLANG_FRENCH_BELGIAN", 0x02),
    ("SUBLANG_FRENCH_CANADIAN", 0x03),
    ("SUBLANG_FRENCH_SWISS", 0x04),
    ("SUBLANG_FRENCH_LUXEMBOURG", 0x05),
    ("SUBLANG_FRENCH_MONACO", 0x06),
    ("SUBLANG_GERMAN", 0x01),
    ("SUBLANG_GERMAN_SWISS", 0x02),
    ("SUBLANG_GERMAN_AUSTRIAN", 0x03),
    ("SUBLANG_GERMAN_LUXEMBOURG", 0x04),
    ("SUBLANG_GERMAN_LIECHTENSTEIN", 0x05),
    ("SUBLANG_ITALIAN", 0x01),
    ("SUBLANG_ITALIAN_SWISS", 0x02),
    ("SUBLANG_KASHMIRI_SASIA", 0x02),
    ("SUBLANG_KASHMIRI_INDIA", 0x02),
    ("SUBLANG_KOREAN", 0x01),
    ("SUBLANG_LITHUANIAN", 0x01),
    ("SUBLANG_MALAY_MALAYSIA", 0x01),
    ("SUBLANG_MALAY_BRUNEI_DARUSSALAM", 0x02),
    ("SUBLANG_NEPALI_INDIA", 0x02),
    ("SUBLANG_NORWEGIAN_BOKMAL", 0x01),
    ("SUBLANG_NORWEGIAN_NYNORSK", 0x02),
    ("SUBLANG_PORTUGUESE", 0x02),
    ("SUBLANG_PORTUGUESE_BRAZILIAN", 0x01),
    ("SUBLANG_SERBIAN_LATIN", 0x02),
    ("SUBLANG_SERBIAN_CYRILLIC", 0x03),
    ("SUBLANG_SPANISH", 0x01),
    ("SUBLANG_SPANISH_MEXICAN", 0x02),
    ("SUBLANG_SPANISH_MODERN", 0x03),
    ("SUBLANG_SPANISH_GUATEMALA", 0x04),
    ("SUBLANG_SPANISH_COSTA_RICA", 0x05),
    ("SUBLANG_SPANISH_PANAMA", 0x06),
    ("SUBLANG_SPANISH_DOMINICAN_REPUBLIC", 0x07),
    ("SUBLANG_SPANISH_VENEZUELA", 0x08),
    ("SUBLANG_SPANISH_COLOMBIA", 0x09),
    ("SUBLANG_SPANISH_PERU", 0x0A),
    ("SUBLANG_SPANISH_ARGENTINA", 0x0B),
    ("SUBLANG_SPANISH_ECUADOR", 0x0C),
    ("SUBLANG_SPANISH_CHILE", 0x0D),
    ("SUBLANG_SPANISH_URUGUAY", 0x0E),
    ("SUBLANG_SPANISH_PARAGUAY", 0x0F),
    ("SUBLANG_SPANISH_BOLIVIA", 0x10),
    ("SUBLANG_SPANISH_EL_SALVADOR", 0x11),
    ("SUBLANG_SPANISH_HONDURAS", 0x12),
    ("SUBLANG_SPANISH_NICARAGUA", 0x13),
    ("SUBLANG_SPANISH_PUERTO_RICO", 0x14),
    ("SUBLANG_SWEDISH", 0x01),
    ("SUBLANG_SWEDISH_FINLAND", 0x02),
    ("SUBLANG_URDU_PAKISTAN", 0x01),
    ("SUBLANG_URDU_INDIA", 0x02),
    ("SUBLANG_UZBEK_LATIN", 0x01),
    ("SUBLANG_UZBEK_CYRILLIC", 0x02),
    ("SUBLANG_DUTCH_SURINAM", 0x03),
    ("SUBLANG_ROMANIAN", 0x01),
    ("SUBLANG_ROMANIAN_MOLDAVIA", 0x02),
    ("SUBLANG_RUSSIAN", 0x01),
    ("SUBLANG_RUSSIAN_MOLDAVIA", 0x02),
    ("SUBLANG_CROATIAN", 0x01),
    ("SUBLANG_LITHUANIAN_CLASSIC", 0x02),
    ("SUBLANG_GAELIC", 0x01),
    ("SUBLANG_GAELIC_SCOTTISH", 0x02),
    ("SUBLANG_GAELIC_MANX", 0x03),
]
# SUBLANG maps every sublanguage name to its value, and every value to the
# list of all names that share it, in the order they appear in `sublang`.
# A plain two_way_dict() is not used because it would keep only one name per
# value.
# Initialize the dictionary with all the name->value pairs
SUBLANG = dict(sublang)
# Now add all the value->name information, collecting duplicates in a list
for sublang_name, sublang_value in sublang:
    SUBLANG.setdefault(sublang_value, []).append(sublang_name)
# Resolve a sublang name given the main lang name
#
def get_sublang_name_for_lang(lang_value, sublang_value):
    """Return the sublanguage name for a (language, sublanguage) value pair.

    The language value is resolved through LANG ("*unknown*" when missing).
    Among the names registered in SUBLANG for `sublang_value`, the first one
    containing the language name as a substring is returned; if none matches,
    the first registered name is returned, or "*unknown*" when the value is
    not in SUBLANG at all.
    """
    lang_name = LANG.get(lang_value, "*unknown*")
    candidates = SUBLANG.get(sublang_value, [])
    # if the main language is a substring of a sublang's name, pick that one
    match = next((name for name in candidates if lang_name in name), None)
    if match is not None:
        return match
    # otherwise fall back to the first sublang name
    fallback = SUBLANG.get(sublang_value, ["*unknown*"])
    return fallback[0]
# Ange Albertini's code to process resources' strings
#
def parse_strings(data, counter, l):
    """Decode consecutive length-prefixed UTF-16LE strings from `data` into `l`.

    Each entry starts with a signed little-endian 16-bit count of 2-byte
    units, followed by that many units of text.  Decoded strings are stored
    as l[counter]; `counter` is incremented after every length word, including
    zero-length entries and entries whose length is rejected.  Parsing stops
    at the end of the data, on a truncated length word, or after three
    decoding errors.

    Args:
        data: the raw bytes to parse.
        counter: key to use for the first entry.
        l: mapping (or other index-assignable container) receiving the strings.
    """
    total = len(data)
    offset = 0
    failures = 0
    while offset < total:
        length_word = data[offset : offset + 2]
        if len(length_word) < 2:
            break
        (unit_count,) = struct.unpack("<h", length_word)
        offset += 2
        byte_count = unit_count * 2
        if unit_count != 0 and 0 <= byte_count <= total:
            try:
                l[counter] = b(data[offset : offset + byte_count]).decode("utf-16le")
            except UnicodeDecodeError:
                failures += 1
            if failures >= 3:
                break
            offset += byte_count
        counter += 1
def retrieve_flags(flag_dict, flag_filter):
    """Collect the flags of a dictionary whose names start with a prefix.

    Returns a list of (flag, value) tuples, in dictionary order, for every
    str/bytes key of "flag_dict" that starts with "flag_filter".  Keys of any
    other type are skipped.
    """
    selected = []
    for flag, value in flag_dict.items():
        if not isinstance(flag, (str, bytes)):
            continue
        if flag.startswith(flag_filter):
            selected.append((flag, value))
    return selected
def set_flags(obj, flag_field, flags):
    """Expose each flag of "flags" as a boolean attribute of "obj".

    For every (name, value) pair in "flags" the entry obj.__dict__[name] is
    set to True when "value & flag_field" is non-zero and to False otherwise.
    The instance dictionary is written directly, so a custom __setattr__ on
    the object is not invoked.
    """
    attributes = obj.__dict__
    for flag, value in flags:
        attributes[flag] = bool(value & flag_field)
def power_of_two(val):
    """Return True when `val` is non-zero and has no bit in common with val - 1."""
    if val == 0:
        return False
    # val & (val - 1) clears the lowest set bit, so a zero result means `val`
    # had exactly one bit set.
    return (val & (val - 1)) == 0
def b(x):
    """Coerce `x` to bytes.

    bytes are returned unchanged, a bytearray is converted to bytes, and
    anything else is encoded with the "cp1252" codec.
    """
    if isinstance(x, bytes):
        return x
    if isinstance(x, bytearray):
        return bytes(x)
    return codecs.encode(x, "cp1252")
class AddressSet(set):
    """A set that also tracks the smallest and largest value passed to add().

    `min` and `max` are only updated by add(); they are None until the first
    value is added and are not recomputed when items are removed.
    """

    def __init__(self):
        super().__init__()
        self.min = None
        self.max = None

    def add(self, value):
        """Add `value` to the set and widen the tracked [min, max] range."""
        super().add(value)
        if self.min is None or value < self.min:
            self.min = value
        if self.max is None or value > self.max:
            self.max = value

    def diff(self):
        """Return max - min, or 0 when nothing has been added yet."""
        if self.min is None or self.max is None:
            return 0
        return self.max - self.min
class UnicodeStringWrapperPostProcessor:
    """This class attempts to help the process of identifying strings
    that might be plain Unicode or Pascal. A list of strings will be
    wrapped on it with the hope the overlappings will help make the
    decision about their type.

    `pe` must provide get_data(), get_string_u_at_rva() and get_warnings();
    `rva_ptr` is the RVA at which the string candidate starts.
    """

    def __init__(self, pe, rva_ptr):
        self.pe = pe
        self.rva_ptr = rva_ptr
        # Filled in by render_pascal_16() / render_unicode_16().
        self.string = None

    def get_rva(self):
        """Get the RVA of the string."""
        return self.rva_ptr

    def __str__(self):
        """Return the escaped UTF-8 representation of the string."""
        return self.decode("utf-8", "backslashreplace_")

    def decode(self, *args):
        """Decode the rendered string; `args` are passed on to its decode().

        Returns "" when no string has been rendered (or it is empty).
        """
        if not self.string:
            return ""
        return self.string.decode(*args)

    def invalidate(self):
        """Make this instance None, to express it's no known string type.

        NOTE(review): rebinding the local name `self` does not affect the
        instance or any reference held by callers, so this method currently
        has no effect.  It is left unchanged to avoid altering behaviour.
        """
        self = None

    def render_pascal_16(self):
        """Read the string as a Pascal string: a 16-bit length, then the text.

        On a PEFormatError a warning is appended to the PE's warnings and
        `self.string` is left untouched.
        """
        try:
            self.string = self.pe.get_string_u_at_rva(
                self.rva_ptr + 2, max_length=self.get_pascal_16_length()
            )
        except PEFormatError:
            self.pe.get_warnings().append(
                "Failed rendering pascal string, "
                "attempting to read from RVA 0x{0:x}".format(self.rva_ptr + 2)
            )

    def get_pascal_16_length(self):
        """Return the 16-bit word stored at the string's RVA, or False if unreadable."""
        return self.__get_word_value_at_rva(self.rva_ptr)

    def __get_word_value_at_rva(self, rva):
        """Return the little-endian unsigned 16-bit value at `rva`.

        Returns False when the data cannot be read or is shorter than 2 bytes.
        """
        try:
            data = self.pe.get_data(rva, 2)
        except PEFormatError:
            return False

        if len(data) < 2:
            return False

        return struct.unpack("<H", data)[0]

    def ask_unicode_16(self, next_rva_ptr):
        """The next RVA is taken to be the one immediately following this one.

        Such RVA could indicate the natural end of the string and will be checked
        to see if there's a Unicode NULL character there.

        Returns True (and records `self.length`) only when the word just
        before `next_rva_ptr` could be read and is 0.
        """
        word = self.__get_word_value_at_rva(next_rva_ptr - 2)
        # The helper signals a failed read with False, and `False == 0` is
        # true in Python, so the failure case must be excluded explicitly.
        if word is not False and word == 0:
            self.length = next_rva_ptr - self.rva_ptr
            return True

        return False

    def render_unicode_16(self):
        """Read the string as plain Unicode starting at the string's RVA.

        On a PEFormatError a warning is appended to the PE's warnings and
        `self.string` is left untouched.
        """
        try:
            self.string = self.pe.get_string_u_at_rva(self.rva_ptr)
        except PEFormatError:
            self.pe.get_warnings().append(
                "Failed rendering unicode string, "
                "attempting to read from RVA 0x{0:x}".format(self.rva_ptr)
            )
class PEFormatError(Exception):
    """Generic PE format error exception.

    The constructor argument is kept in `value`; str() of the exception is
    the repr() of that value.
    """

    def __init__(self, value):
        super().__init__(value)
        self.value = value

    def __str__(self):
        return "{0!r}".format(self.value)
class Dump:
    """Convenience class for dumping the PE information.

    Text fragments are accumulated in the `text` list and joined by get_text().
    """

    def __init__(self):
        self.text = []

    def add_lines(self, txt, indent=0):
        """Add every item of the iterable 'txt' as a separate line.

        Each line is indented by 'indent' spaces.
        """
        for entry in txt:
            self.add_line(entry, indent)

    def add_line(self, txt, indent=0):
        """Add 'txt' followed by a newline, indented by 'indent' spaces."""
        terminated = txt + "\n"
        self.add(terminated, indent)

    def add(self, txt, indent=0):
        """Add 'txt' as is (no newline appended), indented by 'indent' spaces."""
        padding = " " * indent
        self.text.append("{0}{1}".format(padding, txt))

    def add_header(self, txt):
        """Add a header line: 'txt' between two runs of ten dashes, then a blank line."""
        rule = "-" * 10
        self.add_line("{0}{1}{0}\n".format(rule, txt))

    def add_newline(self):
        """Add a bare newline."""
        self.text.append("\n")

    def get_text(self):
        """Return everything added so far as a single string."""
        return "".join(map("{0}".format, self.text))
# Size in bytes of each struct-module format character used by the structure
# definitions; sizeof_type() multiplies these by an optional repeat count.
STRUCT_SIZEOF_TYPES = {
    "x": 1,
    "c": 1,
    "b": 1,
    "B": 1,
    "h": 2,
    "H": 2,
    "i": 4,
    "I": 4,
    "l": 4,
    "L": 4,
    "f": 4,
    "q": 8,
    "Q": 8,
    "d": 8,
    "s": 1,
}
@lru_cache(maxsize=2048)
def sizeof_type(t):
    """Return the size in bytes of the struct format element `t`.

    `t` is a format character from STRUCT_SIZEOF_TYPES, optionally preceded
    by a decimal repeat count (for example "8s").  When `t` starts with a
    digit, all digits in it form the count and the remaining characters form
    the type code.

    Raises:
        KeyError: if the type code is not in STRUCT_SIZEOF_TYPES.
    """
    if t[0] not in string.digits:
        return STRUCT_SIZEOF_TYPES[t]

    # extract the count and the type code
    repeat = int("".join(ch for ch in t if ch in string.digits))
    type_code = "".join(ch for ch in t if ch not in string.digits)
    return STRUCT_SIZEOF_TYPES[type_code] * repeat
@lru_cache(maxsize=2048, copy=True)
def set_format(format):
    """Compile a tuple of element definitions into struct layout information.

    Every element of `format` containing a comma has the form
    "<struct type>,<name>[,<alias>...]"; elements without a comma are ignored.

    Returns:
        A 5-tuple (format string, unpacked-data placeholders, field offsets,
        keys, format length) where
          - the format string is "<" followed by all the struct types,
          - the placeholders list holds one None per element,
          - field offsets maps every name to its byte offset,
          - keys holds, per element, the list of names referring to it,
          - the format length is struct.calcsize() of the format string.
    """
    type_codes = ["<"]
    placeholders = []
    field_offsets = {}
    keys = []

    offset = 0
    for elm in format:
        if "," not in elm:
            continue

        elm_type, _, name_list = elm.partition(",")
        type_codes.append(elm_type)
        placeholders.append(None)

        names = []
        for elm_name in name_list.split(","):
            # NOTE(review): `keys` holds lists of names, so a plain string
            # never compares equal to one of its items and this renaming
            # branch is not taken as written.  Kept as is to preserve
            # the generated attribute names.
            if elm_name in keys:
                search_list = [x[: len(elm_name)] for x in keys]
                occ_count = search_list.count(elm_name)
                elm_name = "{0}_{1:d}".format(elm_name, occ_count)
            names.append(elm_name)
            field_offsets[elm_name] = offset

        offset += sizeof_type(elm_type)

        # Some PE header structures have unions on them, so a certain
        # value might have different names, so each key has a list of
        # all the possible members referring to the data.
        keys.append(names)

    format_str = "".join(type_codes)
    format_length = struct.calcsize(format_str)

    return (format_str, placeholders, field_offsets, keys, format_length)
class Structure:
    """Prepare structure object to extract members from data.

    Format is a two-item sequence: the structure name followed by the
    sequence of element definitions, each of the form
    "<struct type>,<name>[,<alias>...]" (see set_format()).
    """

    def __init__(self, format, name=None, file_offset=None):
        """Compile `format` and remember the structure's name and file offset.

        Args:
            format: (structure name, element definitions) as described above.
            name: optional name overriding the one stored in `format`.
            file_offset: offset of the structure in the file; needed by
                get_field_absolute_offset(), dump() and dump_dict().
        """
        field_defs = format[1]
        # need a tuple to be hashable in set_format using lru cache
        if not isinstance(field_defs, tuple):
            field_defs = tuple(field_defs)

        # The compiled format string is forced little endian ("<"), for big
        # endian non Intel platforms.
        (
            self.__format_str__,
            self.__unpacked_data_elms__,
            self.__field_offsets__,
            self.__keys__,
            self.__format_length__,
        ) = set_format(field_defs)

        self.__all_zeroes__ = False
        self.__file_offset__ = file_offset
        self.name = name if name else format[0]

    def __get_format__(self) -> str:
        return self.__format_str__

    def get_field_absolute_offset(self, field_name):
        """Return the offset within the file for the requested field in the structure."""
        return self.__file_offset__ + self.__field_offsets__[field_name]

    def get_field_relative_offset(self, field_name):
        """Return the offset within the structure for the requested field."""
        return self.__field_offsets__[field_name]

    def get_file_offset(self):
        """Return the file offset of the structure."""
        return self.__file_offset__

    def set_file_offset(self, offset):
        """Set the file offset of the structure."""
        self.__file_offset__ = offset

    def all_zeroes(self):
        """Returns true if the unpacked data is all zeros."""
        return self.__all_zeroes__

    def sizeof(self):
        """Return size of the structure."""
        return self.__format_length__

    def __unpack__(self, data):
        """Unpack `data` and expose every field as an attribute.

        Data longer than the structure is truncated to the structure's size.

        Raises:
            PEFormatError: if `data` is shorter than the structure.
        """
        data = b(data)

        if len(data) > self.__format_length__:
            data = data[: self.__format_length__]

        # OC Patch:
        # Some malware have incorrect header lengths.
        # Fail gracefully if this occurs
        # Buggy malware: a29b0118af8b7408444df81701ad5a7f
        #
        elif len(data) < self.__format_length__:
            raise PEFormatError("Data length less than expected header length.")

        # Recompute the flag on every unpack so that it cannot stay True
        # after the structure is unpacked again with non-zero data.
        self.__all_zeroes__ = count_zeroes(data) == len(data)

        self.__unpacked_data_elms__ = struct.unpack(self.__format_str__, data)
        for idx, val in enumerate(self.__unpacked_data_elms__):
            for key in self.__keys__[idx]:
                setattr(self, key, val)

    def __pack__(self):
        """Pack the current attribute values back into bytes."""
        new_values = []

        for idx, val in enumerate(self.__unpacked_data_elms__):
            new_val = None
            for key in self.__keys__[idx]:
                new_val = getattr(self, key)
                # In the case of unions, when the first changed value
                # is picked the loop is exited
                if new_val != val:
                    break
            new_values.append(new_val)

        return struct.pack(self.__format_str__, *new_values)

    def __str__(self):
        return "\n".join(self.dump())

    def __repr__(self):
        return "<Structure: %s>" % (
            " ".join([" ".join(s.split()) for s in self.dump()])
        )

    def dump(self, indentation=0):
        """Returns a list of text lines describing the structure.

        The first line is "[<name>]"; each following line shows a field's
        file offset, offset within the structure, name and value.  The
        `indentation` argument is accepted but not used.
        """
        dump = []

        dump.append("[{0}]".format(self.name))

        # A set makes the per-byte membership test below constant time.
        printable_bytes = {
            ord(i) for i in string.printable if i not in string.whitespace
        }

        # Each item of __keys__ is the list of all names (union members)
        # referring to one unpacked value; see set_format().
        for keys in self.__keys__:
            for key in keys:
                val = getattr(self, key)
                if isinstance(val, int):
                    if key.startswith("Signature_"):
                        val_str = "{:<8X}".format(val)
                    else:
                        val_str = "0x{:<8X}".format(val)
                    if key == "TimeDateStamp" or key == "dwTimeStamp":
                        try:
                            val_str += " [%s UTC]" % time.asctime(time.gmtime(val))
                        except ValueError:
                            val_str += " [INVALID TIME]"
                else:
                    val_str = bytearray(val)
                    if key.startswith("Signature"):
                        # Signatures are shown as hex digits
                        val_str = "".join(
                            ["{:02X}".format(i) for i in val_str.rstrip(b"\x00")]
                        )
                    else:
                        # Other byte fields are shown as text, escaping
                        # anything that is not a printable non-space char
                        val_str = "".join(
                            [
                                chr(i)
                                if (i in printable_bytes)
                                else "\\x{0:02x}".format(i)
                                for i in val_str.rstrip(b"\x00")
                            ]
                        )

                dump.append(
                    "0x%-8X 0x%-3X %-30s %s"
                    % (
                        self.__field_offsets__[key] + self.__file_offset__,
                        self.__field_offsets__[key],
                        key + ":",
                        val_str,
                    )
                )

        return dump

    def dump_dict(self):
        """Returns a dictionary representation of the structure.

        The "Structure" key holds the name; every field name maps to a dict
        with its "FileOffset", "Offset" (within the structure) and "Value".
        """
        dump_dict = {}

        dump_dict["Structure"] = self.name

        # Each item of __keys__ is the list of all names (union members)
        # referring to one unpacked value; see set_format().
        for keys in self.__keys__:
            for key in keys:
                val = getattr(self, key)
                if isinstance(val, int):
                    if key == "TimeDateStamp" or key == "dwTimeStamp":
                        try:
                            val = "0x%-8X [%s UTC]" % (
                                val,
                                time.asctime(time.gmtime(val)),
                            )
                        except ValueError:
                            val = "0x%-8X [INVALID TIME]" % val
                else:
                    # Render byte/str values as text, escaping non-printables
                    val = "".join(
                        chr(d) if chr(d) in string.printable else "\\x%02x" % d
                        for d in [ord(c) if not isinstance(c, int) else c for c in val]
                    )

                dump_dict[key] = {
                    "FileOffset": self.__field_offsets__[key] + self.__file_offset__,
                    "Offset": self.__field_offsets__[key],
                    "Value": val,
                }

        return dump_dict
class SectionStructure(Structure):
"""Convenience section handling class."""
def __init__(self, *argl, **argd):
    """Set up the section; an optional `pe` keyword is stored as `self.pe`.

    The `pe` keyword is removed before the remaining arguments are passed
    on to Structure.__init__.
    """
    if "pe" in argd:
        self.pe = argd.pop("pe")

    # Header fields, unset until the structure is unpacked
    for field_name in (
        "PointerToRawData",
        "VirtualAddress",
        "SizeOfRawData",
        "Misc_VirtualSize",
    ):
        setattr(self, field_name, None)

    Structure.__init__(self, *argl, **argd)

    # Lazily computed values, see the get_*_adj() and contains_rva() methods
    for cache_name in (
        "PointerToRawData_adj",
        "VirtualAddress_adj",
        "section_min_addr",
        "section_max_addr",
    ):
        setattr(self, cache_name, None)
def get_PointerToRawData_adj(self):
    """Return PointerToRawData adjusted by the PE's adjust_FileAlignment().

    The adjusted value is computed on first use and cached in
    `PointerToRawData_adj`; None is returned while PointerToRawData is unset.
    """
    not_cached = self.PointerToRawData_adj is None
    if not_cached and self.PointerToRawData is not None:
        self.PointerToRawData_adj = self.pe.adjust_FileAlignment(
            self.PointerToRawData, self.pe.OPTIONAL_HEADER.FileAlignment
        )
    return self.PointerToRawData_adj
def get_VirtualAddress_adj(self):
    """Return VirtualAddress adjusted by the PE's adjust_SectionAlignment().

    The adjusted value is computed on first use and cached in
    `VirtualAddress_adj`; None is returned while VirtualAddress is unset.
    """
    not_cached = self.VirtualAddress_adj is None
    if not_cached and self.VirtualAddress is not None:
        optional_header = self.pe.OPTIONAL_HEADER
        self.VirtualAddress_adj = self.pe.adjust_SectionAlignment(
            self.VirtualAddress,
            optional_header.SectionAlignment,
            optional_header.FileAlignment,
        )
    return self.VirtualAddress_adj
def get_data(self, start=None, length=None, ignore_padding=False):
    """Get data chunk from a section.

    Allows to query data from the section by passing the
    addresses where the PE file would be loaded by default.
    It is then possible to retrieve code and data by their real
    addresses as they would be if loaded.

    Note that sections on disk can include padding that would
    not be loaded to memory. That is the case if `section.SizeOfRawData`
    is greater than `section.Misc_VirtualSize`, and that means
    that data past `section.Misc_VirtualSize` is padding.
    In case you are not interested in this padding, passing
    `ignore_padding=True` will truncate the result in order
    not to return the padding (if any).

    Args:
        start: address to start reading at; None means the section start.
        length: number of bytes to read; None means SizeOfRawData.
        ignore_padding: limit the end to Misc_VirtualSize bytes past the
            read offset.

    Returns:
        The matching slice of the PE's file data (`self.pe.__data__`).
    """
    if start is None:
        offset = self.get_PointerToRawData_adj()
    else:
        delta = start - self.get_VirtualAddress_adj()
        offset = delta + self.get_PointerToRawData_adj()

    if length is not None:
        end = offset + length
    elif self.SizeOfRawData is not None:
        end = offset + self.SizeOfRawData
    else:
        end = offset

    if ignore_padding and end is not None and offset is not None:
        end = min(end, offset + self.Misc_VirtualSize)

    # PointerToRawData is not adjusted here as we might want to read any possible
    # extra bytes that might get cut off by aligning the start (and hence cutting
    # something off the end)
    if self.PointerToRawData is not None and self.SizeOfRawData is not None:
        raw_limit = self.PointerToRawData + self.SizeOfRawData
        end = min(end, raw_limit)

    return self.pe.__data__[offset:end]
def __setattr__(self, name, val):
    """Store an attribute, keeping ``Characteristics`` and its flags in sync.

    * Assigning ``Characteristics`` refreshes every ``IMAGE_SCN_*`` boolean
      attribute through ``set_flags``.
    * Assigning an already existing ``IMAGE_SCN_*`` attribute sets or clears
      the matching bit in ``Characteristics``.

    Any other attribute is stored unchanged.
    """
    if name == "Characteristics":
        section_flags = retrieve_flags(SECTION_CHARACTERISTICS, "IMAGE_SCN_")

        # Set the section's flags according to the Characteristics member
        set_flags(self, val, section_flags)

    elif "IMAGE_SCN_" in name and hasattr(self, name):
        flag_mask = SECTION_CHARACTERISTICS[name]
        if val:
            self.__dict__["Characteristics"] |= flag_mask
        else:
            # Clear the bit explicitly. XOR would *set* the bit whenever the
            # flag was already cleared, corrupting Characteristics.
            self.__dict__["Characteristics"] &= ~flag_mask

    self.__dict__[name] = val
def get_rva_from_offset(self, offset):
    """Translate a file offset into an RVA using this section's adjusted
    raw-data pointer and virtual address."""
    distance_into_section = offset - self.get_PointerToRawData_adj()
    return distance_into_section + self.get_VirtualAddress_adj()
def get_offset_from_rva(self, rva):
    """Translate an RVA into a file offset using this section's adjusted
    virtual address and raw-data pointer."""
    distance_into_section = rva - self.get_VirtualAddress_adj()
    return distance_into_section + self.get_PointerToRawData_adj()
def contains_offset(self, offset):
    """Check whether the section contains the file offset provided."""
    if self.PointerToRawData is None:
        # bss and other sections containing only uninitialized data must
        # have 0 and do not take space in the file
        return False

    section_start = self.get_PointerToRawData_adj()
    if offset < section_start:
        return False
    return offset - section_start < self.SizeOfRawData
def contains_rva(self, rva):
    """Check whether the section contains the address provided.

    The address range of the section is computed once and cached in
    ``self.section_min_addr`` / ``self.section_max_addr``; later calls only
    compare ``rva`` against the cached bounds.
    """
    # speedup: reuse the bounds computed by an earlier call
    cached_min = self.section_min_addr
    cached_max = self.section_max_addr
    if cached_min is not None and cached_max is not None:
        return cached_min <= rva < cached_max

    base = self.get_VirtualAddress_adj()

    # Check if the SizeOfRawData is realistic. If it's bigger than the size
    # of the whole PE file minus the start address of the section it could
    # be either truncated or the SizeOfRawData contains a misleading value.
    # In either of those cases we take the VirtualSize.
    bytes_available = len(self.pe.__data__) - self.get_PointerToRawData_adj()
    if bytes_available < self.SizeOfRawData:
        # PECOFF documentation v8 says:
        # VirtualSize: The total size of the section when loaded into
        # memory. If this value is greater than SizeOfRawData, the section
        # is zero-padded. This field is valid only for executable images
        # and should be set to zero for object files.
        size = self.Misc_VirtualSize
    else:
        size = max(self.SizeOfRawData, self.Misc_VirtualSize)

    # If a following section starts before the computed end of this one,
    # shrink this section so that it ends where the next one begins.
    next_start = self.next_section_virtual_address
    if (
        next_start is not None
        and next_start > self.VirtualAddress
        and base + size > next_start
    ):
        size = next_start - base

    upper_bound = base + size
    self.section_min_addr = base
    self.section_max_addr = upper_bound
    return base <= rva < upper_bound
def contains(self, rva):
    """Shorthand for ``contains_rva(rva)``."""
    is_inside = self.contains_rva(rva)
    return is_inside
def get_entropy(self):
    """Calculate and return the entropy for the section."""
    section_bytes = self.get_data()
    return self.entropy_H(section_bytes)
def get_hash_sha1(self):
    """Get the SHA-1 hex-digest of the section's data.

    Returns ``None`` when the module-level ``sha1`` name is ``None``.
    """
    if sha1 is None:
        return None
    hasher = sha1(self.get_data())
    return hasher.hexdigest()
def get_hash_sha256(self):
    """Get the SHA-256 hex-digest of the section's data.

    Returns ``None`` when the module-level ``sha256`` name is ``None``.
    """
    if sha256 is None:
        return None
    hasher = sha256(self.get_data())
    return hasher.hexdigest()
def get_hash_sha512(self):
    """Get the SHA-512 hex-digest of the section's data.

    Returns ``None`` when the module-level ``sha512`` name is ``None``.
    """
    if sha512 is None:
        return None
    hasher = sha512(self.get_data())
    return hasher.hexdigest()
def get_hash_md5(self):
    """Get the MD5 hex-digest of the section's data.

    Returns ``None`` when the module-level ``md5`` name is ``None``.
    """
    if md5 is None:
        return None
    hasher = md5(self.get_data())
    return hasher.hexdigest()
def entropy_H(self, data):
    """Calculate the entropy of a chunk of data.

    The result is the Shannon entropy in bits per byte of ``data``;
    empty (or otherwise falsy) input yields ``0.0``.
    """
    if not data:
        return 0.0

    total = len(data)
    byte_counts = Counter(bytearray(data))

    entropy = 0
    for count in byte_counts.values():
        probability = float(count) / total
        entropy -= probability * math.log(probability, 2)

    return entropy
@lru_cache(maxsize=2048, copy=False)
def set_bitfields_format(format):
    """Translate a structure format containing bitfields into a plain one.

    ``format`` is a ``(name, fields)`` pair; only ``format[1]`` is read.
    Consecutive bitfield entries such as ``"B:4,Low"`` / ``"B:4,High"`` that
    share a base type and fit into it are merged into one artificial
    compound field (``"B,~LowHigh"``). The resulting plain format is handed
    to ``set_format``.

    Returns the tuple ``(format_str, format_length, field_offsets, keys,
    extended_keys, comp_fields)`` where ``extended_keys`` lists one key per
    individual bitfield and ``comp_fields`` maps the index of each compound
    field to ``(base type, [(subfield name, bit count), ...])``.

    Raises NotImplementedError for bitfield entries that declare a union.
    """
    plain_format = []
    compound_fields = {}

    # State of the compound field currently being accumulated. The "~"
    # prefix distinguishes the artificially created compound field from
    # regular fields.
    pending_type = None
    pending_name = "~"
    pending_subfields = []
    bits_available = 0

    def flush_pending():
        """Emit the compound field collected so far, if there is one."""
        nonlocal pending_type, pending_name, pending_subfields
        if pending_type is None:
            return
        plain_format.append(pending_type + "," + pending_name)
        compound_fields[len(plain_format) - 1] = (pending_type, pending_subfields)
        pending_name = "~"
        pending_type = None
        pending_subfields = []

    for element in format[1]:
        if ":" not in element:
            # Regular field: close any open compound field and copy it over.
            flush_pending()
            plain_format.append(element)
            continue

        type_spec, field_name = element.split(",", 1)
        if "," in field_name:
            raise NotImplementedError(
                "Structures with bitfields do not support unions yet"
            )
        base_type, bit_count = type_spec.split(":", 1)
        bit_count = int(bit_count)

        # Start a new compound field when the base type changes or the
        # current one has no room left for this bitfield.
        if base_type != pending_type or bit_count > bits_available:
            flush_pending()
            bits_available = STRUCT_SIZEOF_TYPES[base_type] * 8
            pending_type = base_type

        pending_name += field_name
        bits_available -= bit_count
        pending_subfields.append((field_name, bit_count))

    flush_pending()

    format_str, _, field_offsets, keys, format_length = set_format(
        tuple(plain_format)
    )

    extended_keys = []
    for idx, key in enumerate(keys):
        if idx not in compound_fields:
            extended_keys.append(key)
            continue
        subfields = compound_fields[idx][1]
        for subfield in subfields:
            subfield_name = subfield[StructureWithBitfields.BTF_NAME_IDX]
            extended_keys.append([subfield_name])
            # Every subfield lives at the offset of its compound field.
            field_offsets[subfield_name] = field_offsets[key[0]]

    return (
        format_str,
        format_length,
        field_offsets,
        keys,
        extended_keys,
        compound_fields,
    )
class StructureWithBitfields(Structure):
    """
    Extends Structure's functionality with support for bitfields such as:
        ('B:4,LowerHalf', 'B:4,UpperHalf')

    To this end, two lists are maintained:
        * self.__keys__ that contains compound fields, for example
          ('B,~LowerHalfUpperHalf'), and is used during packing/unpacking
        * self.__keys_ext__ containing a separate key for each field (ex.,
          LowerHalf, UpperHalf) to simplify implementation of dump()

    This way the implementation of unpacking/packing and dump() from
    Structure can be reused.

    In addition, we create a dictionary:
        <compound_field_index_in_keys> -->
            (data type, [ (subfield name, length in bits)+ ] )
    that facilitates bitfield packing and unpacking.

    With lru_cache() creating only one instance per format string, the
    memory overhead is negligible.
    """

    # Indexes into a (subfield name, bit count) pair
    BTF_NAME_IDX = 0
    BTF_BITCNT_IDX = 1
    # Indexes into a (data type, subfields) compound-field description
    CF_TYPE_IDX = 0
    CF_SUBFLD_IDX = 1

    def __init__(self, format, name=None, file_offset=None):
        layout = set_bitfields_format(format)
        (
            self.__format_str__,
            self.__format_length__,
            self.__field_offsets__,
            self.__keys__,
            self.__keys_ext__,
            self.__compound_fields__,
        ) = layout
        # create our own unpacked_data_elms to ensure they are not shared
        # among StructureWithBitfields instances with the same format string
        self.__unpacked_data_elms__ = [None] * self.__format_length__
        self.__all_zeroes__ = False
        self.__file_offset__ = file_offset
        self.name = format[0] if name is None else name

    def __unpack__(self, data):
        # calling the original routine to deal with special cases/spurious
        # data structures
        super().__unpack__(data)
        self._unpack_bitfield_attributes()

    def __pack__(self):
        self._pack_bitfield_attributes()
        try:
            return super().__pack__()
        finally:
            # Always restore the individual sub-field attributes.
            self._unpack_bitfield_attributes()

    def dump(self, indentation=0):
        compound_keys = self.__keys__
        self.__keys__ = self.__keys_ext__
        try:
            return super().dump(indentation)
        finally:
            self.__keys__ = compound_keys

    def dump_dict(self):
        compound_keys = self.__keys__
        self.__keys__ = self.__keys_ext__
        try:
            return super().dump_dict()
        finally:
            self.__keys__ = compound_keys

    def _unpack_bitfield_attributes(self):
        """Replace compound attributes corresponding to bitfields with
        separate sub-fields.
        """
        name_idx = StructureWithBitfields.BTF_NAME_IDX
        bitcnt_idx = StructureWithBitfields.BTF_BITCNT_IDX
        for idx, compound in self.__compound_fields__.items():
            compound_name = self.__keys__[idx][0]
            packed = getattr(self, compound_name)
            delattr(self, compound_name)
            shift = 0
            for subfield in compound[StructureWithBitfields.CF_SUBFLD_IDX]:
                bit_count = subfield[bitcnt_idx]
                value = (packed >> shift) & ((1 << bit_count) - 1)
                setattr(self, subfield[name_idx], value)
                shift += bit_count

    def _pack_bitfield_attributes(self):
        """Pack attributes into a compound bitfield"""
        name_idx = StructureWithBitfields.BTF_NAME_IDX
        bitcnt_idx = StructureWithBitfields.BTF_BITCNT_IDX
        for idx, compound in self.__compound_fields__.items():
            compound_name = self.__keys__[idx][0]
            packed = 0
            shift = 0
            for subfield in compound[StructureWithBitfields.CF_SUBFLD_IDX]:
                bit_count = subfield[bitcnt_idx]
                low_bits = getattr(self, subfield[name_idx]) & ((1 << bit_count) - 1)
                packed |= low_bits << shift
                shift += bit_count
            setattr(self, compound_name, packed)
class DataContainer:
    """Generic data container.

    Every keyword argument becomes an instance attribute. The attributes
    are stored through the parent class' ``__setattr__`` so that a
    ``__setattr__`` defined by a subclass is not triggered during
    construction.
    """

    def __init__(self, **args):
        store = super().__setattr__
        for key in args:
            store(key, args[key])
class ImportDescData(DataContainer):
    """Holds import descriptor information.

    Attributes (supplied as keyword arguments on construction):

    dll:        name of the imported DLL
    imports:    list of imported symbols (ImportData instances)
    struct:     IMAGE_IMPORT_DESCRIPTOR structure
    """
class ImportData(DataContainer):
    """Holds imported symbol's information.

    ordinal:    Ordinal of the symbol
    name:       Name of the symbol
    bound:      If the symbol is bound, this contains
                the address.
    """

    def __setattr__(self, name, val):
        """Store the attribute and mirror it into the underlying PE data.

        Once the instance carries ``ordinal``, ``bound`` and ``name``,
        assigning ``ordinal``, ``bound``, ``address`` or ``name`` also
        updates the thunk structures (``struct_table`` / ``struct_iat``) or
        the file bytes through ``self.pe``. Any other attribute, or any
        assignment on a not yet fully populated instance, is simply stored.

        Raises:
            PEFormatError: if a new name is longer than the existing one, or
                if the ordinal is set on a PE whose type is neither PE32 nor
                PE32+.
        """
        # If the instance doesn't yet have an ordinal attribute
        # it's not fully initialized so can't do any of the
        # following
        if (
            hasattr(self, "ordinal")
            and hasattr(self, "bound")
            and hasattr(self, "name")
        ):
            if name == "ordinal":
                if self.pe.PE_TYPE == OPTIONAL_HEADER_MAGIC_PE:
                    ordinal_flag = IMAGE_ORDINAL_FLAG
                elif self.pe.PE_TYPE == OPTIONAL_HEADER_MAGIC_PE_PLUS:
                    ordinal_flag = IMAGE_ORDINAL_FLAG64
                else:
                    # Previously this fell through and failed with an
                    # UnboundLocalError on `ordinal_flag`.
                    raise PEFormatError(
                        "Cannot set the import ordinal: unsupported PE type."
                    )

                # Set the ordinal and flag the entry as importing by ordinal
                self.struct_table.Ordinal = ordinal_flag | (val & 0xFFFF)
                self.struct_table.AddressOfData = self.struct_table.Ordinal
                self.struct_table.Function = self.struct_table.Ordinal
                self.struct_table.ForwarderString = self.struct_table.Ordinal

            elif name == "bound":
                if self.struct_iat is not None:
                    # Keep all the union members in sync, like the
                    # "address" branch does for struct_table.
                    self.struct_iat.AddressOfData = val
                    self.struct_iat.Ordinal = self.struct_iat.AddressOfData
                    self.struct_iat.Function = self.struct_iat.AddressOfData
                    self.struct_iat.ForwarderString = self.struct_iat.AddressOfData

            elif name == "address":
                self.struct_table.AddressOfData = val
                self.struct_table.Ordinal = self.struct_table.AddressOfData
                self.struct_table.Function = self.struct_table.AddressOfData
                self.struct_table.ForwarderString = self.struct_table.AddressOfData

            elif name == "name":
                if self.name_offset:
                    # Complain if the length of the new name is longer than
                    # the existing one. This is checked before anything is
                    # written so a rejected name leaves the file untouched.
                    if len(val) > len(self.name):
                        raise PEFormatError(
                            "The import name provided is longer than the existing one."
                        )

                    # Make sure we reset the entry in case the import had
                    # been set to import by ordinal
                    name_rva = self.pe.get_rva_from_offset(self.name_offset)
                    self.pe.set_dword_at_offset(
                        self.ordinal_offset, (0 << 31) | name_rva
                    )
                    self.pe.set_bytes_at_offset(self.name_offset, val)

        self.__dict__[name] = val
class ExportDirData(DataContainer):
    """Holds export directory information.

    Attributes (supplied as keyword arguments on construction):

    struct:     IMAGE_EXPORT_DIRECTORY structure
    symbols:    list of exported symbols (ExportData instances)
    """
class ExportData(DataContainer):
    """Holds exported symbols' information.

    ordinal:    ordinal of the symbol
    address:    address of the symbol
    name:       name of the symbol (None if the symbol is
                exported by ordinal only)
    forwarder:  if the symbol is forwarded it will
                contain the name of the target symbol,
                None otherwise.
    """

    def __setattr__(self, name, val):
        """Store the attribute and write it through to the PE data.

        Once the instance carries ``ordinal``, ``address``, ``forwarder``
        and ``name``, assigning any of those four also patches the file
        contents through ``self.pe``. Other attributes are simply stored.

        Raises:
            PEFormatError: if a new name/forwarder is longer than the
                existing one, or if there is no existing string (the current
                value is None) to overwrite.
        """
        # If the instance doesn't yet have an ordinal attribute
        # it's not fully initialized so can't do any of the
        # following
        fully_initialized = all(
            hasattr(self, attr)
            for attr in ("ordinal", "address", "forwarder", "name")
        )
        if fully_initialized:
            if name == "ordinal":
                self.pe.set_word_at_offset(self.ordinal_offset, val)
            elif name == "address":
                self.pe.set_dword_at_offset(self.address_offset, val)
            elif name == "name":
                # A symbol exported by ordinal only has no name string in
                # the file; report that instead of failing on len(None).
                if self.name is None:
                    raise PEFormatError(
                        "The symbol has no existing export name to overwrite."
                    )
                # Complain if the length of the new name is longer than the
                # existing one
                if len(val) > len(self.name):
                    raise PEFormatError(
                        "The export name provided is longer than the existing one."
                    )
                self.pe.set_bytes_at_offset(self.name_offset, val)
            elif name == "forwarder":
                # A non-forwarded symbol has no forwarder string in the
                # file; report that instead of failing on len(None).
                if self.forwarder is None:
                    raise PEFormatError(
                        "The symbol has no existing forwarder name to overwrite."
                    )
                # Complain if the length of the new name is longer than the
                # existing one
                if len(val) > len(self.forwarder):
                    raise PEFormatError(
                        "The forwarder name provided is longer than the existing one."
                    )
                self.pe.set_bytes_at_offset(self.forwarder_offset, val)

        self.__dict__[name] = val
class ResourceDirData(DataContainer):
    """Holds resource directory information.

    Attributes (supplied as keyword arguments on construction):

    struct:     IMAGE_RESOURCE_DIRECTORY structure
    entries:    list of entries (ResourceDirEntryData instances)
    """
class ResourceDirEntryData(DataContainer):
    """Holds resource directory entry data.

    Attributes (supplied as keyword arguments on construction):

    struct:     IMAGE_RESOURCE_DIRECTORY_ENTRY structure
    name:       If the resource is identified by name this
                attribute will contain the name string. None
                otherwise. If identified by id, the id is
                available at 'struct.Id'
    id:         the id, also in struct.Id
    directory:  If this entry has a lower level directory
                this attribute will point to the
                ResourceDirData instance representing it.
    data:       If this entry has no further lower directories
                and points to the actual resource data, this
                attribute will reference the corresponding
                ResourceDataEntryData instance.

    (Either of the 'directory' or 'data' attribute will exist,
    but not both.)
    """
class ResourceDataEntryData(DataContainer):
    """Holds resource data entry information.

    Attributes (supplied as keyword arguments on construction):

    struct:     IMAGE_RESOURCE_DATA_ENTRY structure
    lang:       Primary language ID
    sublang:    Sublanguage ID
    """
class DebugData(DataContainer):
    """Holds debug information.

    Attributes (supplied as keyword arguments on construction):

    struct:     IMAGE_DEBUG_DIRECTORY structure
    entries:    list of entries (IMAGE_DEBUG_TYPE instances)
    """
class DynamicRelocationData(DataContainer):
    """Holds dynamic relocation information.

    Attributes (supplied as keyword arguments on construction):

    struct:         IMAGE_DYNAMIC_RELOCATION structure
    symbol:         Symbol to which dynamic relocations must be applied
    relocations:    List of dynamic relocations for this symbol
                    (BaseRelocationData instances)
    """
class BaseRelocationData(DataContainer):
    """Holds base relocation information.

    Attributes (supplied as keyword arguments on construction):

    struct:     IMAGE_BASE_RELOCATION structure
    entries:    list of relocation data (RelocationData instances)
    """
class RelocationData(DataContainer):
    """Holds relocation information.

    type:       Type of relocation
                The type string can be obtained by
                RELOCATION_TYPE[type]
    rva:        RVA of the relocation
    """

    def __setattr__(self, name, val):
        """Store the attribute, re-encoding ``struct.Data`` when needed.

        ``struct.Data`` is a word holding the relocation type in its top
        four bits and the offset from ``base_rva`` in its low twelve bits.
        Assigning ``type`` or ``rva`` rewrites the matching part of it.
        """
        # If the instance doesn't yet have a struct attribute it's not
        # fully initialized, so the value is only stored.
        if not hasattr(self, "struct"):
            self.__dict__[name] = val
            return

        entry = self.struct

        # Get the word containing the type and data
        packed = entry.Data

        if name == "type":
            packed = (val << 12) | (packed & 0xFFF)
        elif name == "rva":
            delta = val - self.base_rva
            offset = delta if delta > 0 else 0
            packed = (packed & 0xF000) | (offset & 0xFFF)

        # Store the (possibly) modified data
        entry.Data = packed

        self.__dict__[name] = val
class TlsData(DataContainer):
    """Holds TLS information.

    Attributes (supplied as keyword arguments on construction):

    struct:     IMAGE_TLS_DIRECTORY structure
    """
class BoundImportDescData(DataContainer):
    """Holds bound import descriptor data.

    This directory entry will provide information on the
    DLLs this PE file has been bound to (if bound at all).
    The structure will contain the name and timestamp of the
    DLL at the time of binding so that the loader can know
    whether it differs from the one currently present in the
    system and must, therefore, re-bind the PE's imports.

    Attributes (supplied as keyword arguments on construction):

    struct:     IMAGE_BOUND_IMPORT_DESCRIPTOR structure
    name:       DLL name
    entries:    list of entries (BoundImportRefData instances)
                the entries will exist if this DLL has forwarded
                symbols. If so, the destination DLL will have an
                entry in this list.
    """
class LoadConfigData(DataContainer):
    """Holds Load Config data.

    Attributes (supplied as keyword arguments on construction):

    struct:                 IMAGE_LOAD_CONFIG_DIRECTORY structure
    name:                   dll name
    dynamic_relocations:    dynamic relocation information, if present
    """
class BoundImportRefData(DataContainer):
    """Holds bound import forwarder reference data.

    Contains the same information as the bound descriptor but
    for forwarded DLLs, if any.

    Attributes (supplied as keyword arguments on construction):

    struct:     IMAGE_BOUND_FORWARDER_REF structure
    name:       dll name
    """
class ExceptionsDirEntryData(DataContainer):
    """Holds the data related to SEH (and stack unwinding, in particular)

    Attributes (supplied as keyword arguments on construction):

    struct:     an instance of RUNTIME_FUNCTION
    unwindinfo: an instance of UNWIND_INFO
    """
class UnwindInfo(StructureWithBitfields):
    """Handles the complexities of UNWIND_INFO structure:

    * variable number of UNWIND_CODEs
    * optional ExceptionHandler and FunctionEntry fields
    """

    def __init__(self, file_offset=0):
        super().__init__(
            (
                "UNWIND_INFO",
                (
                    "B:3,Version",
                    "B:5,Flags",
                    "B,SizeOfProlog",
                    "B,CountOfCodes",
                    "B:4,FrameRegister",
                    "B:4,FrameOffset",
                ),
            ),
            file_offset=file_offset,
        )
        # Size of the fixed header only; unpack_in_stages() replaces it with
        # the full size once CountOfCodes and Flags are known.
        self._full_size = super().sizeof()
        # Name of the optional trailing field ("ExceptionHandler" or
        # "FunctionEntry"), or None when there is none.
        self._opt_field_name = None
        # Scratch structure used to peek at each UNWIND_CODE slot.
        self._code_info = StructureWithBitfields(
            ("UNWIND_CODE", ("B,CodeOffset", "B:4,UnwindOp", "B:4,OpInfo")),
            file_offset=0,
        )
        self._chained_entry = None
        self._finished_unpacking = False

    def unpack_in_stages(self, data):
        """Unpacks the UNWIND_INFO "in two calls", with the first call
        establishing a full size of the structure and the second, performing
        the actual unpacking.

        Returns None when more data is required or when unpacking completed,
        and an error message string when the structure has an unsupported
        version or contains an unknown unwind code.
        """
        if self._finished_unpacking:
            return None

        super().__unpack__(data)
        # The unwind code array is padded to an even number of slots.
        codes_cnt_max = (self.CountOfCodes + 1) & ~1
        hdlr_offset = super().sizeof() + codes_cnt_max * self._code_info.sizeof()
        self._full_size = hdlr_offset + (
            0 if self.Flags == 0 else STRUCT_SIZEOF_TYPES["I"]
        )

        if len(data) < self._full_size:
            # Not enough data yet; the caller is expected to call again with
            # at least sizeof() bytes.
            return None

        if self.Version != 1 and self.Version != 2:
            return "Unsupported version of UNWIND_INFO at " + hex(self.__file_offset__)

        self.UnwindCodes = []
        ro = super().sizeof()
        codes_left = self.CountOfCodes
        while codes_left > 0:
            self._code_info.__unpack__(data[ro : ro + self._code_info.sizeof()])
            ucode = PrologEpilogOpsFactory.create(self._code_info)
            if ucode is None:
                return "Unknown UNWIND_CODE at " + hex(self.__file_offset__ + ro)

            # One operation may span several UNWIND_CODE slots.
            len_in_codes = ucode.length_in_code_structures(self._code_info, self)
            opc_size = self._code_info.sizeof() * len_in_codes
            ucode.initialize(
                self._code_info,
                data[ro : ro + opc_size],
                self,
                self.__file_offset__ + ro,
            )
            ro += opc_size
            codes_left -= len_in_codes
            self.UnwindCodes.append(ucode)

        if self.UNW_FLAG_EHANDLER or self.UNW_FLAG_UHANDLER:
            self._opt_field_name = "ExceptionHandler"

        if self.UNW_FLAG_CHAININFO:
            self._opt_field_name = "FunctionEntry"

        if self._opt_field_name is not None:
            setattr(
                self,
                self._opt_field_name,
                struct.unpack(
                    "<I", data[hdlr_offset : hdlr_offset + STRUCT_SIZEOF_TYPES["I"]]
                )[0],
            )

        self._finished_unpacking = True

        return None

    def dump(self, indentation=0):
        # Because __keys_ext__ are shared among all the instances with the
        # same format string, we have to add and subsequently remove the
        # optional field each time.
        # It saves space (as compared to keeping a copy self.__keys_ext__ per
        # UnwindInfo instance), but makes our dump() implementation
        # thread-unsafe.
        if self._opt_field_name is not None:
            self.__field_offsets__[self._opt_field_name] = (
                self._full_size - STRUCT_SIZEOF_TYPES["I"]
            )
            self.__keys_ext__.append([self._opt_field_name])
        try:
            dump = super().dump(indentation)
        finally:
            if self._opt_field_name is not None:
                self.__keys_ext__.pop()

        dump.append(
            "Flags: "
            + ", ".join([s[0] for s in unwind_info_flags if getattr(self, s[0])])
        )
        dump.append(
            "Unwind codes: "
            + "; ".join([str(c) for c in self.UnwindCodes if c.is_valid()])
        )
        return dump

    def dump_dict(self):
        # Same temporary extension of the shared __keys_ext__ as in dump().
        if self._opt_field_name is not None:
            self.__field_offsets__[self._opt_field_name] = (
                self._full_size - STRUCT_SIZEOF_TYPES["I"]
            )
            self.__keys_ext__.append([self._opt_field_name])
        try:
            ret = super().dump_dict()
        finally:
            if self._opt_field_name is not None:
                self.__keys_ext__.pop()

        return ret

    def __setattr__(self, name, val):
        """Store an attribute, keeping ``Flags`` and its booleans in sync.

        Assigning ``Flags`` refreshes the ``UNW_FLAG_*`` attributes through
        ``set_flags``; assigning an existing ``UNW_FLAG_*`` attribute sets or
        clears the matching bit in ``Flags``.
        """
        if name == "Flags":
            set_flags(self, val, unwind_info_flags)
        elif "UNW_FLAG_" in name and hasattr(self, name):
            flag_mask = UNWIND_INFO_FLAGS[name]
            if val:
                self.__dict__["Flags"] |= flag_mask
            else:
                # Clear the bit explicitly. XOR would *set* the bit whenever
                # the flag was already cleared, corrupting Flags.
                self.__dict__["Flags"] &= ~flag_mask
        self.__dict__[name] = val

    def sizeof(self):
        """Return the full size of the structure as established so far."""
        return self._full_size

    def __pack__(self):
        """Serialize the header, the unwind codes and the optional field."""
        data = bytearray(self._full_size)
        data[0 : super().sizeof()] = super().__pack__()
        cur_offset = super().sizeof()

        for uc in self.UnwindCodes:
            # Never write past the size computed while unpacking.
            if cur_offset + uc.struct.sizeof() > self._full_size:
                break
            data[cur_offset : cur_offset + uc.struct.sizeof()] = uc.struct.__pack__()
            cur_offset += uc.struct.sizeof()

        if self._opt_field_name is not None:
            data[
                self._full_size - STRUCT_SIZEOF_TYPES["I"] : self._full_size
            ] = struct.pack("<I", getattr(self, self._opt_field_name))

        return data

    def get_chained_function_entry(self):
        return self._chained_entry

    def set_chained_function_entry(self, entry):
        """Set the chained function entry; it may be set only once.

        Raises PEFormatError if an entry has already been set.
        """
        if self._chained_entry is not None:
            raise PEFormatError("Chained function entry cannot be changed")
        self._chained_entry = entry
class PrologEpilogOp:
    """Meant as an abstract class representing a generic unwind code.

    There is a subclass of PrologEpilogOp for each member of UNWIND_OP_CODES
    enum.
    """

    def initialize(self, unw_code, data, unw_info, file_offset):
        """Build ``self.struct`` from this operation's format and unpack
        ``data`` into it."""
        layout = self._get_format(unw_code)
        self.struct = StructureWithBitfields(layout, file_offset=file_offset)
        self.struct.__unpack__(data)

    def length_in_code_structures(self, unw_code, unw_info):
        """Computes how many UNWIND_CODE structures UNWIND_CODE occupies.

        May be called before initialize() and, for that reason, should not
        rely on the values of instance attributes.
        """
        return 1

    def is_valid(self):
        return True

    def _get_format(self, unw_code):
        fields = ("B,CodeOffset", "B:4,UnwindOp", "B:4,OpInfo")
        return ("UNWIND_CODE", fields)
class PrologEpilogOpPushReg(PrologEpilogOp):
    """UWOP_PUSH_NONVOL"""

    def _get_format(self, unw_code):
        fields = ("B,CodeOffset", "B:4,UnwindOp", "B:4,Reg")
        return ("UNWIND_CODE_PUSH_NONVOL", fields)

    def __str__(self):
        register = REGISTERS[self.struct.Reg]
        return "".join((".PUSHREG ", register))
class PrologEpilogOpAllocLarge(PrologEpilogOp):
    """UWOP_ALLOC_LARGE"""

    def _get_format(self, unw_code):
        # OpInfo selects the encoding of the size: a 16-bit count of qwords
        # when it is 0, a 32-bit size otherwise.
        if unw_code.OpInfo == 0:
            size_field = "H,AllocSizeInQwords"
        else:
            size_field = "I,AllocSize"
        return (
            "UNWIND_CODE_ALLOC_LARGE",
            ("B,CodeOffset", "B:4,UnwindOp", "B:4,OpInfo", size_field),
        )

    def length_in_code_structures(self, unw_code, unw_info):
        if unw_code.OpInfo == 0:
            return 2
        return 3

    def get_alloc_size(self):
        if self.struct.OpInfo == 0:
            return self.struct.AllocSizeInQwords * 8
        return self.struct.AllocSize

    def __str__(self):
        return "".join((".ALLOCSTACK ", hex(self.get_alloc_size())))
class PrologEpilogOpAllocSmall(PrologEpilogOp):
    """UWOP_ALLOC_SMALL"""

    def _get_format(self, unw_code):
        fields = ("B,CodeOffset", "B:4,UnwindOp", "B:4,AllocSizeInQwordsMinus8")
        return ("UNWIND_CODE_ALLOC_SMALL", fields)

    def get_alloc_size(self):
        encoded = self.struct.AllocSizeInQwordsMinus8
        return encoded * 8 + 8

    def __str__(self):
        return "".join((".ALLOCSTACK ", hex(self.get_alloc_size())))
class PrologEpilogOpSetFP(PrologEpilogOp):
    """UWOP_SET_FPREG"""

    def initialize(self, unw_code, data, unw_info, file_offset):
        super().initialize(unw_code, data, unw_info, file_offset)
        # The frame register and offset come from the enclosing UNWIND_INFO;
        # FrameOffset is multiplied by 16.
        self._frame_register = unw_info.FrameRegister
        self._frame_offset = unw_info.FrameOffset * 16

    def __str__(self):
        register = REGISTERS[self._frame_register]
        return ".SETFRAME " + ", ".join((register, hex(self._frame_offset)))
class PrologEpilogOpSaveReg(PrologEpilogOp):
    """UWOP_SAVE_NONVOL"""

    def length_in_code_structures(self, unwcode, unw_info):
        return 2

    def get_offset(self):
        qwords = self.struct.OffsetInQwords
        return qwords * 8

    def _get_format(self, unw_code):
        fields = ("B,CodeOffset", "B:4,UnwindOp", "B:4,Reg", "H,OffsetInQwords")
        return ("UNWIND_CODE_SAVE_NONVOL", fields)

    def __str__(self):
        register = REGISTERS[self.struct.Reg]
        return ".SAVEREG " + ", ".join((register, hex(self.get_offset())))
class PrologEpilogOpSaveRegFar(PrologEpilogOp):
    """UWOP_SAVE_NONVOL_FAR"""

    def length_in_code_structures(self, unw_code, unw_info):
        return 3

    def get_offset(self):
        return self.struct.Offset

    def _get_format(self, unw_code):
        fields = ("B,CodeOffset", "B:4,UnwindOp", "B:4,Reg", "I,Offset")
        return ("UNWIND_CODE_SAVE_NONVOL_FAR", fields)

    def __str__(self):
        register = REGISTERS[self.struct.Reg]
        return ".SAVEREG " + ", ".join((register, hex(self.struct.Offset)))
class PrologEpilogOpSaveXMM(PrologEpilogOp):
    """UWOP_SAVE_XMM128"""

    def _get_format(self, unw_code):
        fields = ("B,CodeOffset", "B:4,UnwindOp", "B:4,Reg", "H,OffsetIn2Qwords")
        return ("UNWIND_CODE_SAVE_XMM128", fields)

    def length_in_code_structures(self, unw_code, unw_info):
        return 2

    def get_offset(self):
        double_qwords = self.struct.OffsetIn2Qwords
        return double_qwords * 16

    def __str__(self):
        register = str(self.struct.Reg)
        return ".SAVEXMM128 XMM" + ", ".join((register, hex(self.get_offset())))
class PrologEpilogOpSaveXMMFar(PrologEpilogOp):
    """UWOP_SAVE_XMM128_FAR"""

    def _get_format(self, unw_code):
        fields = ("B,CodeOffset", "B:4,UnwindOp", "B:4,Reg", "I,Offset")
        return ("UNWIND_CODE_SAVE_XMM128_FAR", fields)

    def length_in_code_structures(self, unw_code, unw_info):
        return 3

    def get_offset(self):
        return self.struct.Offset

    def __str__(self):
        register = str(self.struct.Reg)
        return ".SAVEXMM128 XMM" + ", ".join((register, hex(self.struct.Offset)))
class PrologEpilogOpPushFrame(PrologEpilogOp):
    """UWOP_PUSH_MACHFRAME"""

    def __str__(self):
        # A truthy OpInfo adds the " <code>" suffix.
        if self.struct.OpInfo:
            suffix = " <code>"
        else:
            suffix = ""
        return ".PUSHFRAME" + suffix
class PrologEpilogOpEpilogMarker(PrologEpilogOp):
    """UWOP_EPILOG"""

    def initialize(self, unw_code, data, unw_info, file_offset):
        self._long_offst = True
        # The first epilog code encountered for an UNWIND_INFO is the one
        # that has not had SizeOfEpilog attached to it yet.
        self._first = not hasattr(unw_info, "SizeOfEpilog")
        super().initialize(unw_code, data, unw_info, file_offset)
        if self._first:
            unw_info.SizeOfEpilog = self.struct.Size
            self._long_offst = (unw_code.OpInfo & 1) == 0
        self._epilog_size = unw_info.SizeOfEpilog

    def _get_format(self, unw_code):
        if not self._first:
            return (
                "UNWIND_CODE_EPILOG",
                ("B,OffsetLow", "B:4,UnwindOp", "B:4,OffsetHigh"),
            )

        # check if it is the first epilog code among encountered; then its
        # record will contain size of the epilog
        if unw_code.OpInfo & 1 == 1:
            fields = ("B,OffsetLow,Size", "B:4,UnwindOp", "B:4,Flags")
        else:
            fields = (
                "B,Size",
                "B:4,UnwindOp",
                "B:4,Flags",
                "B,OffsetLow",
                "B:4,Unused",
                "B:4,OffsetHigh",
            )
        return ("UNWIND_CODE_EPILOG", fields)

    def length_in_code_structures(self, unw_code, unw_info):
        is_first = not hasattr(unw_info, "SizeOfEpilog")
        if is_first and (unw_code.OpInfo & 1) == 0:
            return 2
        return 1

    def get_offset(self):
        offset = self.struct.OffsetLow
        if self._long_offst:
            return offset | (self.struct.OffsetHigh << 8)
        return offset | 0

    def is_valid(self):
        return self.get_offset() > 0

    def __str__(self):
        # the EPILOG sequence may have a terminating all-zeros entry
        offset = self.get_offset()
        if offset > 0:
            return (
                "EPILOG: size="
                + hex(self._epilog_size)
                + ", offset from the end=-"
                + hex(offset)
            )
        return ""
class PrologEpilogOpsFactory:
    """A factory for creating unwind codes based on the value of UnwindOp"""

    # Maps each UnwindOp value to the class implementing it.
    _class_dict = {
        UWOP_PUSH_NONVOL: PrologEpilogOpPushReg,
        UWOP_ALLOC_LARGE: PrologEpilogOpAllocLarge,
        UWOP_ALLOC_SMALL: PrologEpilogOpAllocSmall,
        UWOP_SET_FPREG: PrologEpilogOpSetFP,
        UWOP_SAVE_NONVOL: PrologEpilogOpSaveReg,
        UWOP_SAVE_NONVOL_FAR: PrologEpilogOpSaveRegFar,
        UWOP_SAVE_XMM128: PrologEpilogOpSaveXMM,
        UWOP_SAVE_XMM128_FAR: PrologEpilogOpSaveXMMFar,
        UWOP_PUSH_MACHFRAME: PrologEpilogOpPushFrame,
        UWOP_EPILOG: PrologEpilogOpEpilogMarker,
    }

    @staticmethod
    def create(unwcode):
        """Return a new operation object for ``unwcode.UnwindOp``, or None
        when the operation code is not known."""
        op_class = PrologEpilogOpsFactory._class_dict.get(unwcode.UnwindOp)
        if op_class is None:
            return None
        return op_class()
# Valid FAT32 8.3 short filename characters according to:
#  http://en.wikipedia.org/wiki/8.3_filename
# This will help decide whether DLL ASCII names are likely
# to be valid or otherwise corrupt data
#
# The filename length is not checked because a DLL's filename
# can be longer than the 8.3 format allows
allowed_filename = b(
    string.ascii_letters + string.digits + "!#$%&'()-@^_`{}~+,.;=[]"
)
def is_valid_dos_filename(s):
    """Check whether ``s`` only uses characters accepted in DLL file names.

    ``s`` may be ``str``, ``bytes`` or ``bytearray``; any other type
    (including ``None``) yields ``False``. Besides the characters in
    ``allowed_filename``, both path separators are accepted because import
    names can contain directories. The length is not checked, so an empty
    name is reported as valid.
    """
    if s is None or not isinstance(s, (str, bytes, bytearray)):
        return False

    # Allow path separators as import names can contain directories.
    allowed = frozenset(allowed_filename + b"\\/")

    # Iterating bytes yields integers while iterating str yields characters,
    # and testing `str in bytes` raises TypeError. Compare code points so
    # that text and binary names are treated alike; code points above 255
    # are simply not in the allowed set.
    if isinstance(s, str):
        codes = {ord(char) for char in s}
    else:
        codes = set(s)
    return codes <= allowed
# Check if an imported name uses the valid accepted characters expected in
# mangled function names. If the symbol's characters don't fall within this
# charset we will assume the name is invalid.
# The dot "." character comes from: https://github.com/erocarrera/pefile/pull/346
# All other symbols can be inserted by adding a name with that symbol to a
# .def file, and passing it to link.exe (See export_test.py)
allowed_function_name = b(string.ascii_letters + string.digits)
@lru_cache(maxsize=2048)
def is_valid_function_name(
    s: Union[str, bytes, bytearray], relax_allowed_characters: bool = False
) -> bool:
    """Check whether ``s`` only uses characters expected in function names.

    Letters and digits (``allowed_function_name``) are always accepted,
    together with ``._?@$()<>``. With ``relax_allowed_characters=True`` a
    wider set of punctuation is accepted instead of that short list.

    Values that are ``None`` or not ``str``/``bytes``/``bytearray`` yield
    ``False``; an empty name is reported as valid.

    NOTE: results are memoized with ``lru_cache``, so the arguments must be
    hashable; a ``bytearray`` has to be converted to ``bytes`` by the caller.
    """
    if s is None or not isinstance(s, (str, bytes, bytearray)):
        return False

    allowed_extra = b"._?@$()<>"
    if relax_allowed_characters:
        allowed_extra = b"!\"#$%&'()*+,-./:<>?[\\]^_`{|}~@"
    allowed = frozenset(allowed_function_name + allowed_extra)

    # Iterating bytes yields integers while iterating str yields characters,
    # and testing `str in bytes` raises TypeError. Compare code points so
    # that text and binary names are treated alike; code points above 255
    # are simply not in the allowed set.
    if isinstance(s, str):
        codes = {ord(char) for char in s}
    else:
        codes = set(s)
    return codes <= allowed
class PE:
"""A Portable Executable representation.
This class provides access to most of the information in a PE file.
It expects to be supplied the name of the file to load or PE data
to process and an optional argument 'fast_load' (False by default)
which controls whether to load all the directories information,
which can be quite time consuming.
pe = pefile.PE('module.dll')
pe = pefile.PE(name='module.dll')
would load 'module.dll' and process it. If the data is already
available in a buffer the same can be achieved with:
pe = pefile.PE(data=module_dll_data)
The "fast_load" can be set to a default by setting its value in the
module itself by means, for instance, of a "pefile.fast_load = True".
That will make all the subsequent instances not to load the
whole PE structure. The "full_load" method can be used to parse
the missing data at a later stage.
Basic headers information will be available in the attributes:
DOS_HEADER
NT_HEADERS
FILE_HEADER
OPTIONAL_HEADER
All of them will contain among their attributes the members of the
corresponding structures as defined in WINNT.H
The raw data corresponding to the header (from the beginning of the
file up to the start of the first section) will be available in the
instance's attribute 'header' as a string.
The sections will be available as a list in the 'sections' attribute.
Each entry will contain as attributes all the structure's members.
Directory entries will be available as attributes (if they exist):
(no other entries are processed at this point)
DIRECTORY_ENTRY_IMPORT (list of ImportDescData instances)
DIRECTORY_ENTRY_EXPORT (ExportDirData instance)
DIRECTORY_ENTRY_RESOURCE (ResourceDirData instance)
DIRECTORY_ENTRY_DEBUG (list of DebugData instances)
DIRECTORY_ENTRY_BASERELOC (list of BaseRelocationData instances)
DIRECTORY_ENTRY_TLS
DIRECTORY_ENTRY_BOUND_IMPORT (list of BoundImportData instances)
The following dictionary attributes provide ways of mapping different
constants. They will accept the numeric value and return the string
representation and the opposite, feed in the string and get the
numeric constant:
DIRECTORY_ENTRY
IMAGE_CHARACTERISTICS
SECTION_CHARACTERISTICS
DEBUG_TYPE
SUBSYSTEM_TYPE
MACHINE_TYPE
RELOCATION_TYPE
RESOURCE_TYPE
LANG
SUBLANG
"""
#
# Format specifications for PE structures.
#
# Each specification is a 2-tuple: (structure name, tuple of field specs).
# The structure name (index 0) is what the unpack helpers of this class report
# in their "Corrupt header" warnings. Every field spec is a string of the form
# "<type>,<name>[,<alias>...]".
# NOTE(review): the type codes look like `struct` format characters and the
# extra comma-separated names like aliases for the same field (unions in
# WINNT.H); both are interpreted by the Structure classes defined elsewhere in
# this file -- confirm there.
__IMAGE_DOS_HEADER_format__ = (
"IMAGE_DOS_HEADER",
(
"H,e_magic",
"H,e_cblp",
"H,e_cp",
"H,e_crlc",
"H,e_cparhdr",
"H,e_minalloc",
"H,e_maxalloc",
"H,e_ss",
"H,e_sp",
"H,e_csum",
"H,e_ip",
"H,e_cs",
"H,e_lfarlc",
"H,e_ovno",
"8s,e_res",
"H,e_oemid",
"H,e_oeminfo",
"20s,e_res2",
"I,e_lfanew",
),
)
__IMAGE_FILE_HEADER_format__ = (
"IMAGE_FILE_HEADER",
(
"H,Machine",
"H,NumberOfSections",
"I,TimeDateStamp",
"I,PointerToSymbolTable",
"I,NumberOfSymbols",
"H,SizeOfOptionalHeader",
"H,Characteristics",
),
)
__IMAGE_DATA_DIRECTORY_format__ = (
"IMAGE_DATA_DIRECTORY",
("I,VirtualAddress", "I,Size"),
)
# 32-bit (PE32) optional header.
__IMAGE_OPTIONAL_HEADER_format__ = (
"IMAGE_OPTIONAL_HEADER",
(
"H,Magic",
"B,MajorLinkerVersion",
"B,MinorLinkerVersion",
"I,SizeOfCode",
"I,SizeOfInitializedData",
"I,SizeOfUninitializedData",
"I,AddressOfEntryPoint",
"I,BaseOfCode",
"I,BaseOfData",
"I,ImageBase",
"I,SectionAlignment",
"I,FileAlignment",
"H,MajorOperatingSystemVersion",
"H,MinorOperatingSystemVersion",
"H,MajorImageVersion",
"H,MinorImageVersion",
"H,MajorSubsystemVersion",
"H,MinorSubsystemVersion",
"I,Reserved1",
"I,SizeOfImage",
"I,SizeOfHeaders",
"I,CheckSum",
"H,Subsystem",
"H,DllCharacteristics",
"I,SizeOfStackReserve",
"I,SizeOfStackCommit",
"I,SizeOfHeapReserve",
"I,SizeOfHeapCommit",
"I,LoaderFlags",
"I,NumberOfRvaAndSizes",
),
)
# 64-bit (PE32+) optional header: there is no BaseOfData member, and ImageBase
# and the stack/heap sizes are 8 bytes wide.
__IMAGE_OPTIONAL_HEADER64_format__ = (
"IMAGE_OPTIONAL_HEADER64",
(
"H,Magic",
"B,MajorLinkerVersion",
"B,MinorLinkerVersion",
"I,SizeOfCode",
"I,SizeOfInitializedData",
"I,SizeOfUninitializedData",
"I,AddressOfEntryPoint",
"I,BaseOfCode",
"Q,ImageBase",
"I,SectionAlignment",
"I,FileAlignment",
"H,MajorOperatingSystemVersion",
"H,MinorOperatingSystemVersion",
"H,MajorImageVersion",
"H,MinorImageVersion",
"H,MajorSubsystemVersion",
"H,MinorSubsystemVersion",
"I,Reserved1",
"I,SizeOfImage",
"I,SizeOfHeaders",
"I,CheckSum",
"H,Subsystem",
"H,DllCharacteristics",
"Q,SizeOfStackReserve",
"Q,SizeOfStackCommit",
"Q,SizeOfHeapReserve",
"Q,SizeOfHeapCommit",
"I,LoaderFlags",
"I,NumberOfRvaAndSizes",
),
)
__IMAGE_NT_HEADERS_format__ = ("IMAGE_NT_HEADERS", ("I,Signature",))
__IMAGE_SECTION_HEADER_format__ = (
"IMAGE_SECTION_HEADER",
(
"8s,Name",
"I,Misc,Misc_PhysicalAddress,Misc_VirtualSize",
"I,VirtualAddress",
"I,SizeOfRawData",
"I,PointerToRawData",
"I,PointerToRelocations",
"I,PointerToLinenumbers",
"H,NumberOfRelocations",
"H,NumberOfLinenumbers",
"I,Characteristics",
),
)
__IMAGE_DELAY_IMPORT_DESCRIPTOR_format__ = (
"IMAGE_DELAY_IMPORT_DESCRIPTOR",
(
"I,grAttrs",
"I,szName",
"I,phmod",
"I,pIAT",
"I,pINT",
"I,pBoundIAT",
"I,pUnloadIAT",
"I,dwTimeStamp",
),
)
__IMAGE_IMPORT_DESCRIPTOR_format__ = (
"IMAGE_IMPORT_DESCRIPTOR",
(
"I,OriginalFirstThunk,Characteristics",
"I,TimeDateStamp",
"I,ForwarderChain",
"I,Name",
"I,FirstThunk",
),
)
__IMAGE_EXPORT_DIRECTORY_format__ = (
"IMAGE_EXPORT_DIRECTORY",
(
"I,Characteristics",
"I,TimeDateStamp",
"H,MajorVersion",
"H,MinorVersion",
"I,Name",
"I,Base",
"I,NumberOfFunctions",
"I,NumberOfNames",
"I,AddressOfFunctions",
"I,AddressOfNames",
"I,AddressOfNameOrdinals",
),
)
__IMAGE_RESOURCE_DIRECTORY_format__ = (
"IMAGE_RESOURCE_DIRECTORY",
(
"I,Characteristics",
"I,TimeDateStamp",
"H,MajorVersion",
"H,MinorVersion",
"H,NumberOfNamedEntries",
"H,NumberOfIdEntries",
),
)
__IMAGE_RESOURCE_DIRECTORY_ENTRY_format__ = (
"IMAGE_RESOURCE_DIRECTORY_ENTRY",
("I,Name", "I,OffsetToData"),
)
__IMAGE_RESOURCE_DATA_ENTRY_format__ = (
"IMAGE_RESOURCE_DATA_ENTRY",
("I,OffsetToData", "I,Size", "I,CodePage", "I,Reserved"),
)
# The version-information structures below all share the same three-field
# header (Length, ValueLength, Type).
__VS_VERSIONINFO_format__ = (
"VS_VERSIONINFO",
("H,Length", "H,ValueLength", "H,Type"),
)
__VS_FIXEDFILEINFO_format__ = (
"VS_FIXEDFILEINFO",
(
"I,Signature",
"I,StrucVersion",
"I,FileVersionMS",
"I,FileVersionLS",
"I,ProductVersionMS",
"I,ProductVersionLS",
"I,FileFlagsMask",
"I,FileFlags",
"I,FileOS",
"I,FileType",
"I,FileSubtype",
"I,FileDateMS",
"I,FileDateLS",
),
)
__StringFileInfo_format__ = (
"StringFileInfo",
("H,Length", "H,ValueLength", "H,Type"),
)
__StringTable_format__ = ("StringTable", ("H,Length", "H,ValueLength", "H,Type"))
__String_format__ = ("String", ("H,Length", "H,ValueLength", "H,Type"))
__Var_format__ = ("Var", ("H,Length", "H,ValueLength", "H,Type"))
__IMAGE_THUNK_DATA_format__ = (
"IMAGE_THUNK_DATA",
("I,ForwarderString,Function,Ordinal,AddressOfData",),
)
# The 64-bit variant keeps the structure name "IMAGE_THUNK_DATA"; only the
# field width differs.
__IMAGE_THUNK_DATA64_format__ = (
"IMAGE_THUNK_DATA",
("Q,ForwarderString,Function,Ordinal,AddressOfData",),
)
__IMAGE_DEBUG_DIRECTORY_format__ = (
"IMAGE_DEBUG_DIRECTORY",
(
"I,Characteristics",
"I,TimeDateStamp",
"H,MajorVersion",
"H,MinorVersion",
"I,Type",
"I,SizeOfData",
"I,AddressOfRawData",
"I,PointerToRawData",
),
)
__IMAGE_BASE_RELOCATION_format__ = (
"IMAGE_BASE_RELOCATION",
("I,VirtualAddress", "I,SizeOfBlock"),
)
__IMAGE_BASE_RELOCATION_ENTRY_format__ = (
"IMAGE_BASE_RELOCATION_ENTRY",
("H,Data",),
)
# The three dynamic relocation formats below use "<type>:<n>" field specs and
# are the ones __init__ registers in dynamic_relocation_format_by_symbol.
# NOTE(review): ":<n>" is presumably a bitfield width in bits, to be decoded
# with StructureWithBitfields -- confirm. The widths of the second and third
# formats add up to 16 bits although they are declared with type "I"; verify
# how the container size is derived.
__IMAGE_IMPORT_CONTROL_TRANSFER_DYNAMIC_RELOCATION_format__ = (
"IMAGE_IMPORT_CONTROL_TRANSFER_DYNAMIC_RELOCATION",
("I:12,PageRelativeOffset", "I:1,IndirectCall", "I:19,IATIndex"),
)
__IMAGE_INDIR_CONTROL_TRANSFER_DYNAMIC_RELOCATION_format__ = (
"IMAGE_INDIR_CONTROL_TRANSFER_DYNAMIC_RELOCATION",
(
"I:12,PageRelativeOffset",
"I:1,IndirectCall",
"I:1,RexWPrefix",
"I:1,CfgCheck",
"I:1,Reserved",
),
)
__IMAGE_SWITCHTABLE_BRANCH_DYNAMIC_RELOCATION_format__ = (
"IMAGE_SWITCHTABLE_BRANCH_DYNAMIC_RELOCATION",
("I:12,PageRelativeOffset", "I:4,RegisterNumber"),
)
__IMAGE_TLS_DIRECTORY_format__ = (
"IMAGE_TLS_DIRECTORY",
(
"I,StartAddressOfRawData",
"I,EndAddressOfRawData",
"I,AddressOfIndex",
"I,AddressOfCallBacks",
"I,SizeOfZeroFill",
"I,Characteristics",
),
)
# The 64-bit variant keeps the structure name "IMAGE_TLS_DIRECTORY"; the four
# address fields are 8 bytes wide.
__IMAGE_TLS_DIRECTORY64_format__ = (
"IMAGE_TLS_DIRECTORY",
(
"Q,StartAddressOfRawData",
"Q,EndAddressOfRawData",
"Q,AddressOfIndex",
"Q,AddressOfCallBacks",
"I,SizeOfZeroFill",
"I,Characteristics",
),
)
# 32-bit load configuration directory: (structure name, field specs), each
# field spec being "<type>,<name>".
# GuardRFVerifyStackPointerFunctionPointer and HotPatchTableOffset are two
# separate 4-byte fields; a missing comma previously fused them into a single
# spec, dropping one field and shifting Reserved3 and
# EnclaveConfigurationPointer by 4 bytes.
__IMAGE_LOAD_CONFIG_DIRECTORY_format__ = (
    "IMAGE_LOAD_CONFIG_DIRECTORY",
    (
        "I,Size",
        "I,TimeDateStamp",
        "H,MajorVersion",
        "H,MinorVersion",
        "I,GlobalFlagsClear",
        "I,GlobalFlagsSet",
        "I,CriticalSectionDefaultTimeout",
        "I,DeCommitFreeBlockThreshold",
        "I,DeCommitTotalFreeThreshold",
        "I,LockPrefixTable",
        "I,MaximumAllocationSize",
        "I,VirtualMemoryThreshold",
        "I,ProcessHeapFlags",
        "I,ProcessAffinityMask",
        "H,CSDVersion",
        "H,Reserved1",
        "I,EditList",
        "I,SecurityCookie",
        "I,SEHandlerTable",
        "I,SEHandlerCount",
        "I,GuardCFCheckFunctionPointer",
        "I,GuardCFDispatchFunctionPointer",
        "I,GuardCFFunctionTable",
        "I,GuardCFFunctionCount",
        "I,GuardFlags",
        "H,CodeIntegrityFlags",
        "H,CodeIntegrityCatalog",
        "I,CodeIntegrityCatalogOffset",
        "I,CodeIntegrityReserved",
        "I,GuardAddressTakenIatEntryTable",
        "I,GuardAddressTakenIatEntryCount",
        "I,GuardLongJumpTargetTable",
        "I,GuardLongJumpTargetCount",
        "I,DynamicValueRelocTable",
        "I,CHPEMetadataPointer",
        "I,GuardRFFailureRoutine",
        "I,GuardRFFailureRoutineFunctionPointer",
        "I,DynamicValueRelocTableOffset",
        "H,DynamicValueRelocTableSection",
        "H,Reserved2",
        "I,GuardRFVerifyStackPointerFunctionPointer",
        "I,HotPatchTableOffset",
        "I,Reserved3",
        "I,EnclaveConfigurationPointer",
    ),
)
# 64-bit load configuration directory. It keeps the structure name
# "IMAGE_LOAD_CONFIG_DIRECTORY"; pointer-sized members are 8 bytes wide ("Q").
# As with the other specifications, each entry is "<type>,<name>".
__IMAGE_LOAD_CONFIG_DIRECTORY64_format__ = (
"IMAGE_LOAD_CONFIG_DIRECTORY",
(
"I,Size",
"I,TimeDateStamp",
"H,MajorVersion",
"H,MinorVersion",
"I,GlobalFlagsClear",
"I,GlobalFlagsSet",
"I,CriticalSectionDefaultTimeout",
"Q,DeCommitFreeBlockThreshold",
"Q,DeCommitTotalFreeThreshold",
"Q,LockPrefixTable",
"Q,MaximumAllocationSize",
"Q,VirtualMemoryThreshold",
"Q,ProcessAffinityMask",
"I,ProcessHeapFlags",
"H,CSDVersion",
"H,Reserved1",
"Q,EditList",
"Q,SecurityCookie",
"Q,SEHandlerTable",
"Q,SEHandlerCount",
"Q,GuardCFCheckFunctionPointer",
"Q,GuardCFDispatchFunctionPointer",
"Q,GuardCFFunctionTable",
"Q,GuardCFFunctionCount",
"I,GuardFlags",
"H,CodeIntegrityFlags",
"H,CodeIntegrityCatalog",
"I,CodeIntegrityCatalogOffset",
"I,CodeIntegrityReserved",
"Q,GuardAddressTakenIatEntryTable",
"Q,GuardAddressTakenIatEntryCount",
"Q,GuardLongJumpTargetTable",
"Q,GuardLongJumpTargetCount",
"Q,DynamicValueRelocTable",
"Q,CHPEMetadataPointer",
"Q,GuardRFFailureRoutine",
"Q,GuardRFFailureRoutineFunctionPointer",
"I,DynamicValueRelocTableOffset",
"H,DynamicValueRelocTableSection",
"H,Reserved2",
"Q,GuardRFVerifyStackPointerFunctionPointer",
"I,HotPatchTableOffset",
"I,Reserved3",
"Q,EnclaveConfigurationPointer",
),
)
# Dynamic relocation table header and its per-symbol entries. The "64"
# variants differ only in the width of the Symbol field.
__IMAGE_DYNAMIC_RELOCATION_TABLE_format__ = (
"IMAGE_DYNAMIC_RELOCATION_TABLE",
("I,Version", "I,Size"),
)
__IMAGE_DYNAMIC_RELOCATION_format__ = (
"IMAGE_DYNAMIC_RELOCATION",
("I,Symbol", "I,BaseRelocSize"),
)
__IMAGE_DYNAMIC_RELOCATION64_format__ = (
"IMAGE_DYNAMIC_RELOCATION64",
("Q,Symbol", "I,BaseRelocSize"),
)
__IMAGE_DYNAMIC_RELOCATION_V2_format__ = (
"IMAGE_DYNAMIC_RELOCATION_V2",
("I,HeaderSize", "I,FixupInfoSize", "I,Symbol", "I,SymbolGroup", "I,Flags"),
)
__IMAGE_DYNAMIC_RELOCATION64_V2_format__ = (
"IMAGE_DYNAMIC_RELOCATION64_V2",
("I,HeaderSize", "I,FixupInfoSize", "Q,Symbol", "I,SymbolGroup", "I,Flags"),
)
# Bound import descriptor and the forwarder references that may follow it.
__IMAGE_BOUND_IMPORT_DESCRIPTOR_format__ = (
"IMAGE_BOUND_IMPORT_DESCRIPTOR",
("I,TimeDateStamp", "H,OffsetModuleName", "H,NumberOfModuleForwarderRefs"),
)
__IMAGE_BOUND_FORWARDER_REF_format__ = (
"IMAGE_BOUND_FORWARDER_REF",
("I,TimeDateStamp", "H,OffsetModuleName", "H,Reserved"),
)
# Entry of the exception directory.
__RUNTIME_FUNCTION_format__ = (
"RUNTIME_FUNCTION",
("I,BeginAddress", "I,EndAddress", "I,UnwindData"),
)
def __init__(
    self,
    name=None,
    data=None,
    fast_load=None,
    max_symbol_exports=MAX_SYMBOL_EXPORT_COUNT,
    max_repeated_symbol=120,
):
    """Initialise the instance state and parse the given PE.

    Args:
        name: path of the file to load.
        data: buffer with the PE contents, used when `name` is None.
        fast_load: forwarded to __parse__(); when None the module-level
            `fast_load` value is used instead.
        max_symbol_exports: stored as `self.max_symbol_exports`.
        max_repeated_symbol: stored as `self.max_repeated_symbol`.

    Raises:
        ValueError: if neither `name` nor `data` is supplied.
    """
    # Limits supplied by the caller.
    self.max_symbol_exports = max_symbol_exports
    self.max_repeated_symbol = max_repeated_symbol

    # Basic parsing state.
    self.PE_TYPE = None
    self.sections = []
    self._get_section_by_rva_last_used = None
    self.__warnings = []

    if name is None and data is None:
        raise ValueError("Must supply either name or data")

    # Every structure created is recorded here, so that modifications can
    # later be saved by iterating over the list.
    self.__structures__ = []
    self.__from_file = None

    # These warnings are only meant to be reported once.
    self.FileAlignment_Warning = False
    self.SectionAlignment_Warning = False

    # Running totals: resource entries across nested tables, the bytes of
    # all parsed resource entries (which should not exceed the file size)
    # and the number of imports parsed in this file.
    self.__total_resource_entries_count = 0
    self.__total_resource_bytes = 0
    self.__total_import_symbols = 0

    # Bitfield formats of the dynamic relocations, keyed by symbol value.
    self.dynamic_relocation_format_by_symbol = {
        3: PE.__IMAGE_IMPORT_CONTROL_TRANSFER_DYNAMIC_RELOCATION_format__,
        4: PE.__IMAGE_INDIR_CONTROL_TRANSFER_DYNAMIC_RELOCATION_format__,
        5: PE.__IMAGE_SWITCHTABLE_BRANCH_DYNAMIC_RELOCATION_format__,
    }

    if fast_load is None:
        fast_load = globals()["fast_load"]

    try:
        self.__parse__(name, data, fast_load)
    except BaseException:
        # Release whatever was acquired before propagating the error.
        self.close()
        raise
def __enter__(self):
return self
def __exit__(self, type, value, traceback):
self.close()
def close(self):
if (
self.__from_file is True
and hasattr(self, "__data__")
and (
(isinstance(mmap.mmap, type) and isinstance(self.__data__, mmap.mmap))
or "mmap.mmap" in repr(type(self.__data__))
)
):
self.__data__.close()
del self.__data__
def __unpack_data__(self, format, data, file_offset):
    """Build a Structure from `format` and fill it from raw data.

    Args:
        format: structure specification; its first item is the structure
            name used in the warning message.
        data: raw bytes to unpack.
        file_offset: offset of `data` in the file, stored in the structure.

    Returns:
        The unpacked structure, which is also recorded in
        `self.__structures__`, or None (after adding a warning) if unpacking
        raised PEFormatError.
    """
    unpacked = Structure(format, file_offset=file_offset)

    try:
        unpacked.__unpack__(data)
    except PEFormatError as err:
        message = 'Corrupt header "{0}" at file offset {1}. Exception: {2}'.format(
            format[0], file_offset, err
        )
        self.__warnings.append(message)
        return None
    else:
        self.__structures__.append(unpacked)
        return unpacked
def __unpack_data_with_bitfields__(self, format, data, file_offset):
    """Build a StructureWithBitfields from `format` and fill it from raw data.

    Args:
        format: structure specification; its first item is the structure
            name used in the warning message.
        data: raw bytes to unpack.
        file_offset: offset of `data` in the file, stored in the structure.

    Returns:
        The unpacked structure, which is also recorded in
        `self.__structures__`, or None (after adding a warning) if unpacking
        raised PEFormatError.
    """
    unpacked = StructureWithBitfields(format, file_offset=file_offset)

    try:
        unpacked.__unpack__(data)
    except PEFormatError as err:
        message = 'Corrupt header "{0}" at file offset {1}. Exception: {2}'.format(
            format[0], file_offset, err
        )
        self.__warnings.append(message)
        return None
    else:
        self.__structures__.append(unpacked)
        return unpacked
def __parse__(self, fname, data, fast_load):
    """Parse a Portable Executable file.

    Loads a PE file, parsing all its structures and making them available
    through the instance's attributes.

    Args:
        fname: path of the file to load, or None to parse `data`. The file
            is memory-mapped; the file object is closed once mapped.
        data: buffer with the PE contents, used when `fname` is None.
        fast_load: when true, the byte-frequency check is skipped and
            full_load() is not called.

    Raises:
        PEFormatError: if the file is empty or truncated, or if the DOS, NT,
            file or optional headers are missing or carry an unexpected
            signature.
        Exception: if the file cannot be opened or mapped (chained to the
            original IOError).
    """
    if fname is not None:
        stat = os.stat(fname)
        if stat.st_size == 0:
            raise PEFormatError("The file is empty")
        fd = None
        try:
            fd = open(fname, "rb")
            self.fileno = fd.fileno()
            if hasattr(mmap, "MAP_PRIVATE"):
                # Unix
                self.__data__ = mmap.mmap(self.fileno, 0, mmap.MAP_PRIVATE)
            else:
                # Windows
                self.__data__ = mmap.mmap(self.fileno, 0, access=mmap.ACCESS_READ)
            self.__from_file = True
        except IOError as excp:
            exception_msg = "{0}".format(excp)
            exception_msg = exception_msg and (": %s" % exception_msg)
            # Chain the original error so that its traceback is not lost.
            raise Exception(
                "Unable to access file '{0}'{1}".format(fname, exception_msg)
            ) from excp
        finally:
            if fd is not None:
                fd.close()
    elif data is not None:
        self.__data__ = data
        self.__from_file = False

    # Resources should not overlap each other, so they should not exceed the
    # file size.
    self.__resource_size_limit_upperbounds = len(self.__data__)
    self.__resource_size_limit_reached = False

    if not fast_load:
        data_length = len(self.__data__)
        for byte, byte_count in Counter(bytearray(self.__data__)).items():
            # Only report the cases where a byte makes up for more than 50% (if
            # zero) or 15% (if non-zero) of the file's contents. There are
            # legitimate PEs where 0x00 bytes are close to 50% of the whole
            # file's contents.
            ratio = byte_count / data_length
            if (byte == 0 and ratio > 0.5) or (byte != 0 and ratio > 0.15):
                self.__warnings.append(
                    (
                        "Byte 0x{0:02x} makes up {1:.4f}% of the file's contents."
                        " This may indicate truncation / malformation."
                    ).format(byte, 100.0 * byte_count / data_length)
                )

    dos_header_data = self.__data__[:64]
    if len(dos_header_data) != 64:
        raise PEFormatError(
            "Unable to read the DOS Header, possibly a truncated file."
        )

    self.DOS_HEADER = self.__unpack_data__(
        self.__IMAGE_DOS_HEADER_format__, dos_header_data, file_offset=0
    )

    # __unpack_data__ returns None on failure: check that before reading any
    # attribute of the header.
    if self.DOS_HEADER is None:
        raise PEFormatError("DOS Header magic not found.")

    if self.DOS_HEADER.e_magic == IMAGE_DOSZM_SIGNATURE:
        raise PEFormatError("Probably a ZM Executable (not a PE file).")
    if not self.DOS_HEADER or self.DOS_HEADER.e_magic != IMAGE_DOS_SIGNATURE:
        raise PEFormatError("DOS Header magic not found.")

    # OC Patch:
    # Check for sane value in e_lfanew
    #
    if self.DOS_HEADER.e_lfanew > len(self.__data__):
        raise PEFormatError("Invalid e_lfanew value, probably not a PE file")

    nt_headers_offset = self.DOS_HEADER.e_lfanew

    self.NT_HEADERS = self.__unpack_data__(
        self.__IMAGE_NT_HEADERS_format__,
        self.__data__[nt_headers_offset : nt_headers_offset + 8],
        file_offset=nt_headers_offset,
    )

    # We better check the signature right here, before the file screws
    # around with sections:
    # OC Patch:
    # Some malware will cause the Signature value to not exist at all
    if not self.NT_HEADERS or not self.NT_HEADERS.Signature:
        raise PEFormatError("NT Headers not found.")

    if (0xFFFF & self.NT_HEADERS.Signature) == IMAGE_NE_SIGNATURE:
        raise PEFormatError("Invalid NT Headers signature. Probably a NE file")
    if (0xFFFF & self.NT_HEADERS.Signature) == IMAGE_LE_SIGNATURE:
        raise PEFormatError("Invalid NT Headers signature. Probably a LE file")
    if (0xFFFF & self.NT_HEADERS.Signature) == IMAGE_LX_SIGNATURE:
        raise PEFormatError("Invalid NT Headers signature. Probably a LX file")
    if (0xFFFF & self.NT_HEADERS.Signature) == IMAGE_TE_SIGNATURE:
        raise PEFormatError("Invalid NT Headers signature. Probably a TE file")
    if self.NT_HEADERS.Signature != IMAGE_NT_SIGNATURE:
        raise PEFormatError("Invalid NT Headers signature.")

    self.FILE_HEADER = self.__unpack_data__(
        self.__IMAGE_FILE_HEADER_format__,
        self.__data__[nt_headers_offset + 4 : nt_headers_offset + 4 + 32],
        file_offset=nt_headers_offset + 4,
    )
    image_flags = retrieve_flags(IMAGE_CHARACTERISTICS, "IMAGE_FILE_")

    if not self.FILE_HEADER:
        raise PEFormatError("File Header missing")

    # Set the image's flags according the the Characteristics member
    set_flags(self.FILE_HEADER, self.FILE_HEADER.Characteristics, image_flags)

    optional_header_offset = nt_headers_offset + 4 + self.FILE_HEADER.sizeof()

    # Note: location of sections can be controlled from PE header:
    sections_offset = optional_header_offset + self.FILE_HEADER.SizeOfOptionalHeader

    self.OPTIONAL_HEADER = self.__unpack_data__(
        self.__IMAGE_OPTIONAL_HEADER_format__,
        # Read up to 256 bytes to allow creating a copy of too much data
        self.__data__[optional_header_offset : optional_header_offset + 256],
        file_offset=optional_header_offset,
    )

    # According to solardesigner's findings for his
    # Tiny PE project, the optional header does not
    # need fields beyond "Subsystem" in order to be
    # loadable by the Windows loader (given that zeros
    # are acceptable values and the header is loaded
    # in a zeroed memory page)
    # If trying to parse a full Optional Header fails
    # we try to parse it again with some 0 padding
    #
    MINIMUM_VALID_OPTIONAL_HEADER_RAW_SIZE = 69

    if (
        self.OPTIONAL_HEADER is None
        and len(
            self.__data__[optional_header_offset : optional_header_offset + 0x200]
        )
        >= MINIMUM_VALID_OPTIONAL_HEADER_RAW_SIZE
    ):
        # Add enough zeros to make up for the unused fields
        #
        padding_length = 128

        # Create padding
        #
        padded_data = self.__data__[
            optional_header_offset : optional_header_offset + 0x200
        ] + (b"\0" * padding_length)

        self.OPTIONAL_HEADER = self.__unpack_data__(
            self.__IMAGE_OPTIONAL_HEADER_format__,
            padded_data,
            file_offset=optional_header_offset,
        )

    # Check the Magic in the OPTIONAL_HEADER and set the PE file
    # type accordingly
    #
    if self.OPTIONAL_HEADER is not None:
        if self.OPTIONAL_HEADER.Magic == OPTIONAL_HEADER_MAGIC_PE:
            self.PE_TYPE = OPTIONAL_HEADER_MAGIC_PE

        elif self.OPTIONAL_HEADER.Magic == OPTIONAL_HEADER_MAGIC_PE_PLUS:
            self.PE_TYPE = OPTIONAL_HEADER_MAGIC_PE_PLUS

            # The header was read with the 32-bit layout so far; read it
            # again with the 64-bit one.
            self.OPTIONAL_HEADER = self.__unpack_data__(
                self.__IMAGE_OPTIONAL_HEADER64_format__,
                self.__data__[
                    optional_header_offset : optional_header_offset + 0x200
                ],
                file_offset=optional_header_offset,
            )

            # Again, as explained above, we try to parse
            # a reduced form of the Optional Header which
            # is still valid despite not including all
            # structure members
            #
            MINIMUM_VALID_OPTIONAL_HEADER_RAW_SIZE = 69 + 4

            if (
                self.OPTIONAL_HEADER is None
                and len(
                    self.__data__[
                        optional_header_offset : optional_header_offset + 0x200
                    ]
                )
                >= MINIMUM_VALID_OPTIONAL_HEADER_RAW_SIZE
            ):
                padding_length = 128
                padded_data = self.__data__[
                    optional_header_offset : optional_header_offset + 0x200
                ] + (b"\0" * padding_length)
                self.OPTIONAL_HEADER = self.__unpack_data__(
                    self.__IMAGE_OPTIONAL_HEADER64_format__,
                    padded_data,
                    file_offset=optional_header_offset,
                )

    # OC Patch:
    # Die gracefully if there is no OPTIONAL_HEADER field
    # 975440f5ad5e2e4a92c4d9a5f22f75c1
    if self.OPTIONAL_HEADER is None:
        raise PEFormatError("No Optional Header found, invalid PE32 or PE32+ file.")
    if self.PE_TYPE is None:
        self.__warnings.append(
            "Invalid type 0x{0:04x} in Optional Header.".format(
                self.OPTIONAL_HEADER.Magic
            )
        )

    dll_characteristics_flags = retrieve_flags(
        DLL_CHARACTERISTICS, "IMAGE_DLLCHARACTERISTICS_"
    )

    # Set the Dll Characteristics flags according the the DllCharacteristics member
    set_flags(
        self.OPTIONAL_HEADER,
        self.OPTIONAL_HEADER.DllCharacteristics,
        dll_characteristics_flags,
    )

    self.OPTIONAL_HEADER.DATA_DIRECTORY = []
    # offset = (optional_header_offset + self.FILE_HEADER.SizeOfOptionalHeader)
    offset = optional_header_offset + self.OPTIONAL_HEADER.sizeof()

    self.NT_HEADERS.FILE_HEADER = self.FILE_HEADER
    self.NT_HEADERS.OPTIONAL_HEADER = self.OPTIONAL_HEADER

    # Windows 8 specific check
    #
    if (
        self.OPTIONAL_HEADER.AddressOfEntryPoint
        < self.OPTIONAL_HEADER.SizeOfHeaders
    ):
        # The message states the comparison the way it is actually tested.
        self.__warnings.append(
            "AddressOfEntryPoint is smaller than SizeOfHeaders: this file "
            "cannot run under Windows 8."
        )

    # The NumberOfRvaAndSizes is sanitized to stay within
    # reasonable limits so can be casted to an int
    #
    if self.OPTIONAL_HEADER.NumberOfRvaAndSizes > 0x10:
        self.__warnings.append(
            "Suspicious NumberOfRvaAndSizes in the Optional Header. "
            "Normal values are never larger than 0x10, the value is: 0x%x"
            % self.OPTIONAL_HEADER.NumberOfRvaAndSizes
        )

    MAX_ASSUMED_VALID_NUMBER_OF_RVA_AND_SIZES = 0x100
    for i in range(int(0x7FFFFFFF & self.OPTIONAL_HEADER.NumberOfRvaAndSizes)):
        if len(self.__data__) - offset == 0:
            break

        if len(self.__data__) - offset < 8:
            # Zero-pad a truncated trailing entry so that it can be unpacked.
            entry_data = self.__data__[offset:] + b"\0" * 8
        else:
            entry_data = self.__data__[
                offset : offset + MAX_ASSUMED_VALID_NUMBER_OF_RVA_AND_SIZES
            ]

        dir_entry = self.__unpack_data__(
            self.__IMAGE_DATA_DIRECTORY_format__, entry_data, file_offset=offset
        )

        if dir_entry is None:
            break

        # Would fail if missing an entry
        # 1d4937b2fa4d84ad1bce0309857e70ca offending sample
        try:
            dir_entry.name = DIRECTORY_ENTRY[i]
        except (KeyError, AttributeError):
            break

        offset += dir_entry.sizeof()

        self.OPTIONAL_HEADER.DATA_DIRECTORY.append(dir_entry)

        # If the offset goes outside the optional header,
        # the loop is broken, regardless of how many directories
        # NumberOfRvaAndSizes says there are
        #
        # We assume a normally sized optional header, hence that we do
        # a sizeof() instead of reading SizeOfOptionalHeader.
        # Then we add a default number of directories times their size,
        # if we go beyond that, we assume the number of directories
        # is wrong and stop processing
        if offset >= (
            optional_header_offset + self.OPTIONAL_HEADER.sizeof() + 8 * 16
        ):
            break

    offset = self.parse_sections(sections_offset)

    # OC Patch:
    # There could be a problem if there are no raw data sections
    # greater than 0
    # fc91013eb72529da005110a3403541b6 example
    # Should this throw an exception in the minimum header offset
    # can't be found?
    #
    rawDataPointers = [
        self.adjust_FileAlignment(
            s.PointerToRawData, self.OPTIONAL_HEADER.FileAlignment
        )
        for s in self.sections
        if s.PointerToRawData > 0
    ]

    if len(rawDataPointers) > 0:
        lowest_section_offset = min(rawDataPointers)
    else:
        lowest_section_offset = None

    # The header spans from the start of the file up to the first section's
    # data, or up to the end of the section table if that comes later.
    if not lowest_section_offset or lowest_section_offset < offset:
        self.header = self.__data__[:offset]
    else:
        self.header = self.__data__[:lowest_section_offset]

    # Check whether the entry point lies within a section
    #
    if (
        self.get_section_by_rva(self.OPTIONAL_HEADER.AddressOfEntryPoint)
        is not None
    ):
        # Check whether the entry point lies within the file
        #
        ep_offset = self.get_offset_from_rva(
            self.OPTIONAL_HEADER.AddressOfEntryPoint
        )
        if ep_offset > len(self.__data__):
            self.__warnings.append(
                "Possibly corrupt file. AddressOfEntryPoint lies outside the"
                " file. AddressOfEntryPoint: 0x%x"
                % self.OPTIONAL_HEADER.AddressOfEntryPoint
            )
    else:
        self.__warnings.append(
            "AddressOfEntryPoint lies outside the sections' boundaries. "
            "AddressOfEntryPoint: 0x%x" % self.OPTIONAL_HEADER.AddressOfEntryPoint
        )

    if not fast_load:
        self.full_load()
def parse_rich_header(self):
"""Parses the rich header
see http://www.ntcore.com/files/richsign.htm for more information
Structure:
00 DanS ^ checksum, checksum, checksum, checksum
10 Symbol RVA ^ checksum, Symbol size ^ checksum...
...
XX Rich, checksum, 0, 0,...
"""
# Rich Header constants
#
DANS = 0x536E6144 # 'DanS' as dword
RICH = 0x68636952 # 'Rich' as dword
rich_index = self.__data__.find(
b"Rich", 0x80, self.OPTIONAL_HEADER.get_file_offset()
)
if rich_index == -1:
return None
# Read a block of data
try:
# The end of the structure is 8 bytes after the start of the Rich
# string.
rich_data = self.__data__[0x80 : rich_index + 8]
# Make the data have length a multiple of 4, otherwise the
# subsequent parsing will fail. It's not impossible that we retrieve
# truncated data that it's not a multiple.
rich_data = rich_data[: 4 * int(len(rich_data) / 4)]
data = list(
struct.unpack("<{0}I".format(int(len(rich_data) / 4)), rich_data)
)
if RICH not in data:
return None
except PEFormatError:
return None
# get key, raw_data and clear_data
key = struct.pack("<L", data[data.index(RICH) + 1])
result = {"key": key}
raw_data = rich_data[: rich_data.find(b"Rich")]
result["raw_data"] = raw_data
ord_ = lambda c: ord(c) if not isinstance(c, int) else c
clear_data = bytearray()
for idx, val in enumerate(raw_data):
clear_data.append((ord_(val) ^ ord_(key[idx % len(key)])))
result["clear_data"] = bytes(clear_data)
# the checksum should be present 3 times after the DanS signature
#
checksum = data[1]
if data[0] ^ checksum != DANS or data[2] != checksum or data[3] != checksum:
return None
result["checksum"] = checksum
headervalues = []
result["values"] = headervalues
data = data[4:]
for i in range(int(len(data) / 2)):
# Stop until the Rich footer signature is found
#
if data[2 * i] == RICH:
# it should be followed by the checksum
#
if data[2 * i + 1] != checksum:
self.__warnings.append("Rich Header is malformed")
break
# header values come by pairs
#
headervalues += [data[2 * i] ^ checksum, data[2 * i + 1] ^ checksum]
return result
def get_warnings(self):
"""Return the list of warnings.
Non-critical problems found when parsing the PE file are
appended to a list of warnings. This method returns the
full list.
"""
return self.__warnings
def show_warnings(self):
"""Print the list of warnings.
Non-critical problems found when parsing the PE file are
appended to a list of warnings. This method prints the
full list to standard output.
"""
for warning in self.__warnings:
print(">", warning)
def full_load(self):
"""Process the data directories.
This method will load the data directories which might not have
been loaded if the "fast_load" option was used.
"""
self.parse_data_directories()
class RichHeader:
pass
rich_header = self.parse_rich_header()
if rich_header:
self.RICH_HEADER = RichHeader()
self.RICH_HEADER.checksum = rich_header.get("checksum", None)
self.RICH_HEADER.values = rich_header.get("values", None)
self.RICH_HEADER.key = rich_header.get("key", None)
self.RICH_HEADER.raw_data = rich_header.get("raw_data", None)
self.RICH_HEADER.clear_data = rich_header.get("clear_data", None)
else:
self.RICH_HEADER = None
def write(self, filename=None):
"""Write the PE file.
This function will process all headers and components
of the PE file and include all changes made (by just
assigning to attributes in the PE objects) and write
the changes back to a file whose name is provided as
an argument. The filename is optional, if not
provided the data will be returned as a 'str' object.
"""
file_data = bytearray(self.__data__)
for structure in self.__structures__:
struct_data = bytearray(structure.__pack__())
offset = structure.get_file_offset()
file_data[offset : offset + len(struct_data)] = struct_data
if hasattr(self, "VS_VERSIONINFO"):
if hasattr(self, "FileInfo"):
for finfo in self.FileInfo:
for entry in finfo:
if hasattr(entry, "StringTable"):
for st_entry in entry.StringTable:
for key, entry in list(st_entry.entries.items()):
# Offsets and lengths of the keys and values.
# Each value in the dictionary is a tuple:
# (key length, value length)
# The lengths are in characters, not in bytes.
offsets = st_entry.entries_offsets[key]
lengths = st_entry.entries_lengths[key]
if len(entry) > lengths[1]:
l = entry.decode("utf-8").encode("utf-16le")
file_data[
offsets[1] : offsets[1] + lengths[1] * 2
] = l[: lengths[1] * 2]
else:
encoded_data = entry.decode("utf-8").encode(
"utf-16le"
)
file_data[
offsets[1] : offsets[1] + len(encoded_data)
] = encoded_data
new_file_data = file_data
if not filename:
return new_file_data
f = open(filename, "wb+")
f.write(new_file_data)
f.close()
return
def parse_sections(self, offset):
    """Fetch the PE file sections.

    The sections will be readily available in the "sections" attribute.
    Its attributes will contain all the section information plus "data"
    a buffer containing the section's data.

    The "Characteristics" member will be processed and attributes
    representing the section characteristics (with the 'IMAGE_SCN_'
    string trimmed from the constant's names) will be added to the
    section instance.

    Refer to the SectionStructure class for additional info.

    Args:
        offset: file offset of the first section header.

    Returns:
        `offset` plus the size of one section header multiplied by
        FILE_HEADER.NumberOfSections when at least one section was parsed;
        `offset` unchanged otherwise.
    """
    self.sections = []
    # A section triggering this many of the checks below stops the parsing.
    MAX_SIMULTANEOUS_ERRORS = 3
    for i in range(self.FILE_HEADER.NumberOfSections):
        if i >= MAX_SECTIONS:
            self.__warnings.append(
                "Too many sections {0} (>={1})".format(
                    self.FILE_HEADER.NumberOfSections, MAX_SECTIONS
                )
            )
            break
        # Errors are counted per section, not across sections.
        simultaneous_errors = 0
        section = SectionStructure(self.__IMAGE_SECTION_HEADER_format__, pe=self)
        if not section:
            break
        # Section headers are laid out back to back starting at `offset`.
        section_offset = offset + section.sizeof() * i
        section.set_file_offset(section_offset)
        section_data = self.__data__[
            section_offset : section_offset + section.sizeof()
        ]
        # Check if the section is all nulls and stop if so.
        if count_zeroes(section_data) == section.sizeof():
            self.__warnings.append(f"Invalid section {i}. Contents are null-bytes.")
            break
        if not section_data:
            self.__warnings.append(
                f"Invalid section {i}. No data in the file (is this corkami's "
                "virtsectblXP?)."
            )
            break
        section.__unpack__(section_data)
        self.__structures__.append(section)

        # Sanity checks on the header values. Each failed check adds a
        # warning and counts towards MAX_SIMULTANEOUS_ERRORS.
        if section.SizeOfRawData + section.PointerToRawData > len(self.__data__):
            simultaneous_errors += 1
            self.__warnings.append(
                f"Error parsing section {i}. SizeOfRawData is larger than file."
            )

        if self.adjust_FileAlignment(
            section.PointerToRawData, self.OPTIONAL_HEADER.FileAlignment
        ) > len(self.__data__):
            simultaneous_errors += 1
            self.__warnings.append(
                f"Error parsing section {i}. PointerToRawData points beyond "
                "the end of the file."
            )

        if section.Misc_VirtualSize > 0x10000000:
            simultaneous_errors += 1
            self.__warnings.append(
                f"Suspicious value found parsing section {i}. VirtualSize is "
                "extremely large > 256MiB."
            )

        if (
            self.adjust_SectionAlignment(
                section.VirtualAddress,
                self.OPTIONAL_HEADER.SectionAlignment,
                self.OPTIONAL_HEADER.FileAlignment,
            )
            > 0x10000000
        ):
            simultaneous_errors += 1
            self.__warnings.append(
                f"Suspicious value found parsing section {i}. VirtualAddress is "
                "beyond 0x10000000."
            )

        # The FileAlignment != 0 test avoids a division by zero in the modulo.
        if (
            self.OPTIONAL_HEADER.FileAlignment != 0
            and (section.PointerToRawData % self.OPTIONAL_HEADER.FileAlignment) != 0
        ):
            simultaneous_errors += 1
            self.__warnings.append(
                (
                    f"Error parsing section {i}. "
                    "PointerToRawData should normally be "
                    "a multiple of FileAlignment, this might imply the file "
                    "is trying to confuse tools which parse this incorrectly."
                )
            )

        # Note that the offending section was already recorded in
        # __structures__ above, but it is not added to self.sections.
        if simultaneous_errors >= MAX_SIMULTANEOUS_ERRORS:
            self.__warnings.append("Too many warnings parsing section. Aborting.")
            break

        section_flags = retrieve_flags(SECTION_CHARACTERISTICS, "IMAGE_SCN_")

        # Set the section's flags according the the Characteristics member
        set_flags(section, section.Characteristics, section_flags)

        if section.__dict__.get(
            "IMAGE_SCN_MEM_WRITE", False
        ) and section.__dict__.get("IMAGE_SCN_MEM_EXECUTE", False):
            if section.Name.rstrip(b"\x00") == b"PAGE" and self.is_driver():
                # Drivers can have a PAGE section with those flags set without
                # implying that it is malicious
                pass
            else:
                self.__warnings.append(
                    f"Suspicious flags set for section {i}. "
                    "Both IMAGE_SCN_MEM_WRITE and IMAGE_SCN_MEM_EXECUTE are set. "
                    "This might indicate a packed executable."
                )

        self.sections.append(section)

    # Sort the sections by their VirtualAddress and add a field to each of them
    # with the VirtualAddress of the next section. This will allow to check
    # for potentially overlapping sections in badly constructed PEs.
    self.sections.sort(key=lambda a: a.VirtualAddress)

    for idx, section in enumerate(self.sections):
        if idx == len(self.sections) - 1:
            # The last section has no successor.
            section.next_section_virtual_address = None
        else:
            section.next_section_virtual_address = self.sections[
                idx + 1
            ].VirtualAddress

    # The returned offset is computed from the declared NumberOfSections,
    # not from the number of sections actually kept.
    if self.FILE_HEADER.NumberOfSections > 0 and self.sections:
        return (
            offset + self.sections[0].sizeof() * self.FILE_HEADER.NumberOfSections
        )
    else:
        return offset
def parse_data_directories(
    self, directories=None, forwarded_exports_only=False, import_dllnames_only=False
):
    """Parse and process the PE file's data directories.

    If the optional argument 'directories' is given, only
    the directories at the specified indexes will be parsed.
    Such functionality allows parsing of areas of interest
    without the burden of having to parse all others.
    The directories can then be specified as:

    For export / import only:

      directories = [ 0, 1 ]

    or (more verbosely):

      directories = [ DIRECTORY_ENTRY['IMAGE_DIRECTORY_ENTRY_IMPORT'],
        DIRECTORY_ENTRY['IMAGE_DIRECTORY_ENTRY_EXPORT'] ]

    A single index (not wrapped in a list or tuple) is accepted as well.

    If 'directories' is a list, every requested index that has both a
    parser here and a slot in the image's data directory table is removed
    from the list once handled, leaving only the ones that are not present
    in the image. Pass a copy if the caller's list must stay untouched.

    If `forwarded_exports_only` is True, the IMAGE_DIRECTORY_ENTRY_EXPORT
    attribute will only contain exports that are forwarded to another DLL.

    If `import_dllnames_only` is True, symbols will not be parsed from
    the import table and the entries in the IMAGE_DIRECTORY_ENTRY_IMPORT
    attribute will not have a `symbols` attribute.

    Each successfully parsed directory is stored on the instance under the
    directory name with its leading "IMAGE_" removed (for instance
    DIRECTORY_ENTRY_IMPORT).
    """

    directory_parsing = (
        ("IMAGE_DIRECTORY_ENTRY_IMPORT", self.parse_import_directory),
        ("IMAGE_DIRECTORY_ENTRY_EXPORT", self.parse_export_directory),
        ("IMAGE_DIRECTORY_ENTRY_RESOURCE", self.parse_resources_directory),
        ("IMAGE_DIRECTORY_ENTRY_DEBUG", self.parse_debug_directory),
        ("IMAGE_DIRECTORY_ENTRY_BASERELOC", self.parse_relocations_directory),
        ("IMAGE_DIRECTORY_ENTRY_TLS", self.parse_directory_tls),
        ("IMAGE_DIRECTORY_ENTRY_LOAD_CONFIG", self.parse_directory_load_config),
        ("IMAGE_DIRECTORY_ENTRY_DELAY_IMPORT", self.parse_delay_import_directory),
        ("IMAGE_DIRECTORY_ENTRY_BOUND_IMPORT", self.parse_directory_bound_imports),
        ("IMAGE_DIRECTORY_ENTRY_EXCEPTION", self.parse_exceptions_directory),
    )

    if directories is not None and not isinstance(directories, (tuple, list)):
        directories = [directories]

    for dir_name, dir_parser in directory_parsing:
        try:
            directory_index = DIRECTORY_ENTRY[dir_name]
            dir_entry = self.OPTIONAL_HEADER.DATA_DIRECTORY[directory_index]
        except IndexError:
            # The image's table has no slot for this directory. The parsers
            # above are not guaranteed to be listed in index order, so a
            # missing slot says nothing about the remaining ones: skip only
            # this entry instead of abandoning the rest.
            continue

        # Only process all the directories if no individual ones have
        # been chosen
        if directories is None or directory_index in directories:
            value = None
            if dir_entry.VirtualAddress:
                if forwarded_exports_only and dir_name == "IMAGE_DIRECTORY_ENTRY_EXPORT":
                    value = dir_parser(
                        dir_entry.VirtualAddress,
                        dir_entry.Size,
                        forwarded_only=True,
                    )
                elif import_dllnames_only and dir_name == "IMAGE_DIRECTORY_ENTRY_IMPORT":
                    value = dir_parser(
                        dir_entry.VirtualAddress, dir_entry.Size, dllnames_only=True
                    )
                else:
                    try:
                        value = dir_parser(dir_entry.VirtualAddress, dir_entry.Size)
                    except PEFormatError as excp:
                        self.__warnings.append(
                            f'Failed to process directoty "{dir_name}": {excp}'
                        )
                if value:
                    # Drop the "IMAGE_" prefix to build the attribute name.
                    setattr(self, dir_name[6:], value)

        # 'directories' holds indexes, so membership has to be tested with the
        # index (testing the name never matched and the list was never pruned).
        if isinstance(directories, list) and directory_index in directories:
            directories.remove(directory_index)
def parse_exceptions_directory(self, rva, size):
    """Parse the exception directory into a list of runtime function entries.

    All the code related to handling exception directories is documented in
    https://auscitte.github.io/systems%20blog/Exception-Directory-pefile#implementation-details

    Only AMD64 and IA64 images are handled; for any other machine type the
    method returns None. Otherwise it returns a list of
    ExceptionsDirEntryData, one per RUNTIME_FUNCTION that could be read.
    """

    # "For x64 and Itanium platforms; the format is different for other platforms"
    handled_machines = (
        MACHINE_TYPE["IMAGE_FILE_MACHINE_AMD64"],
        MACHINE_TYPE["IMAGE_FILE_MACHINE_IA64"],
    )
    if self.FILE_HEADER.Machine not in handled_machines:
        return None

    entry_size = Structure(self.__RUNTIME_FUNCTION_format__).sizeof()

    entries_by_begin_address = {}
    runtime_functions = []
    unwind_infos_by_rva = {}

    for _ in range(size // entry_size):
        runtime_function = self.__unpack_data__(
            self.__RUNTIME_FUNCTION_format__,
            self.get_data(rva, entry_size),
            file_offset=self.get_offset_from_rva(rva),
        )
        if runtime_function is None:
            break

        unwind_info = None
        unwind_rva = runtime_function.UnwindData

        # according to "Improving Automated Analysis of Windows x64 Binaries",
        # if the lowest bit is set, (UnwindData & ~0x1) should point to the
        # chained RUNTIME_FUNCTION instead of UNWIND_INFO
        if not unwind_rva & 0x1:
            # unwind info data structures can be shared among functions
            unwind_info = unwind_infos_by_rva.get(unwind_rva)
            if unwind_info is None:
                unwind_info = UnwindInfo(
                    file_offset=self.get_offset_from_rva(unwind_rva)
                )
                unwind_infos_by_rva[unwind_rva] = unwind_info

            # unpack_in_stages() is called twice on purpose, each time with the
            # amount of data the structure currently reports through sizeof().
            # A non-None result is a warning and stops the whole directory walk.
            stage_warning = unwind_info.unpack_in_stages(
                self.get_data(unwind_rva, unwind_info.sizeof())
            )
            if stage_warning is None:
                stage_warning = unwind_info.unpack_in_stages(
                    self.get_data(unwind_rva, unwind_info.sizeof())
                )
            if stage_warning is not None:
                self.__warnings.append(stage_warning)
                break

            self.__structures__.append(unwind_info)

        dir_entry = ExceptionsDirEntryData(
            struct=runtime_function, unwindinfo=unwind_info
        )
        runtime_functions.append(dir_entry)
        entries_by_begin_address[runtime_function.BeginAddress] = dir_entry
        rva += entry_size

    # each chained function entry holds a reference to the function first in chain
    for dir_entry in runtime_functions:
        unwind_info = dir_entry.unwindinfo
        if unwind_info is None:
            # TODO: have not encountered such a binary yet;
            # in theory, (UnwindData & ~0x1) should point to the chained
            # RUNTIME_FUNCTION which could be used to locate the corresponding
            # ExceptionsDirEntryData and set_chained_function_entry()
            continue
        if not hasattr(unwind_info, "FunctionEntry"):
            continue
        if unwind_info.FunctionEntry not in entries_by_begin_address:
            self.__warnings.append(
                f"FunctionEntry of UNWIND_INFO at {dir_entry.struct.get_file_offset():x}"
                " points to an entry that does not exist"
            )
            continue
        try:
            unwind_info.set_chained_function_entry(
                entries_by_begin_address[unwind_info.FunctionEntry]
            )
        except PEFormatError as excp:
            self.__warnings.append(
                "Failed parsing FunctionEntry of UNWIND_INFO at "
                f"{dir_entry.struct.get_file_offset():x}: {excp}"
            )

    return runtime_functions
def parse_directory_bound_imports(self, rva, size):
    """Parse the bound imports directory.

    Descriptors are read straight from the raw file data starting at `rva`
    until an all-zero descriptor is found. Returns a list of
    BoundImportDescData, or None (after recording a warning) when a
    descriptor cannot be unpacked or no section can be associated with the
    position following it. `size` is not used.

    Raises PEFormatError when a forwarder reference cannot be read.
    """

    def name_is_corrupt(name):
        # OffsetModuleName points to a DLL name. These shouldn't be too long.
        # A non-empty name longer than 256 bytes, or one containing a byte
        # outside string.printable, is taken to indicate a corrupt entry.
        if not name:
            return False
        unprintable = [c for c in bytearray(name) if chr(c) not in string.printable]
        return bool(len(name) > 256 or unprintable)

    descriptor_format = self.__IMAGE_BOUND_IMPORT_DESCRIPTOR_format__
    descriptor_size = Structure(descriptor_format).sizeof()
    directory_start = rva

    bound_imports = []
    while True:
        descriptor = self.__unpack_data__(
            descriptor_format,
            self.__data__[rva : rva + descriptor_size],
            file_offset=rva,
        )
        if descriptor is None:
            # This directory does not necessarily have to be valid to
            # still have a valid PE file: warn and give up on it.
            self.__warnings.append(
                "The Bound Imports directory exists but can't be parsed."
            )
            return

        if descriptor.all_zeroes():
            break

        rva += descriptor.sizeof()

        section = self.get_section_by_offset(rva)
        file_offset = self.get_offset_from_rva(rva)
        if section is not None:
            safety_boundary = (
                section.PointerToRawData + len(section.get_data()) - file_offset
            )
        else:
            # Default to the end of the file, then tighten the boundary to the
            # start of the first section located past this offset, if any.
            safety_boundary = len(self.__data__) - file_offset
            later_section_starts = [
                s.PointerToRawData
                for s in self.sections
                if s.PointerToRawData > file_offset
            ]
            if later_section_starts:
                section = self.get_section_by_offset(min(later_section_starts))
                if section is not None:
                    safety_boundary = section.PointerToRawData - file_offset

        if not section:
            self.__warnings.append(
                (
                    "RVA of IMAGE_BOUND_IMPORT_DESCRIPTOR points "
                    "to an invalid address: {0:x}"
                ).format(rva)
            )
            return

        forwarder_refs = []
        # 8 is the size of __IMAGE_BOUND_IMPORT_DESCRIPTOR_format__
        forwarder_count = min(
            descriptor.NumberOfModuleForwarderRefs, int(safety_boundary / 8)
        )
        for _ in range(forwarder_count):
            # Both structures IMAGE_BOUND_IMPORT_DESCRIPTOR and
            # IMAGE_BOUND_FORWARDER_REF have the same size.
            forwarder = self.__unpack_data__(
                self.__IMAGE_BOUND_FORWARDER_REF_format__,
                self.__data__[rva : rva + descriptor_size],
                file_offset=rva,
            )
            if not forwarder:
                raise PEFormatError("IMAGE_BOUND_FORWARDER_REF cannot be read")
            rva += forwarder.sizeof()

            name_offset = directory_start + forwarder.OffsetModuleName
            forwarder_name = self.get_string_from_data(
                0, self.__data__[name_offset : name_offset + MAX_STRING_LENGTH]
            )
            if name_is_corrupt(forwarder_name):
                # Abort the processing of this descriptor's forwarder entries.
                break

            forwarder_refs.append(
                BoundImportRefData(struct=forwarder, name=forwarder_name)
            )

        name_offset = directory_start + descriptor.OffsetModuleName
        descriptor_name = self.get_string_from_data(
            0, self.__data__[name_offset : name_offset + MAX_STRING_LENGTH]
        )
        # A corrupt or missing module name ends the walk over the directory.
        if name_is_corrupt(descriptor_name):
            break
        if not descriptor_name:
            break

        bound_imports.append(
            BoundImportDescData(
                struct=descriptor, name=descriptor_name, entries=forwarder_refs
            )
        )

    return bound_imports
def parse_directory_tls(self, rva, size):
    """Parse the TLS directory located at `rva`.

    Returns a TlsData wrapping the unpacked structure, or None when the data
    cannot be read or unpacked. `size` is not used.
    """

    # The 32-bit layout is the fallback for anything that is not PE32+. It may
    # help produce some output for files where the Magic in the Optional
    # Header is incorrect.
    if self.PE_TYPE == OPTIONAL_HEADER_MAGIC_PE_PLUS:
        tls_format = self.__IMAGE_TLS_DIRECTORY64_format__
    else:
        tls_format = self.__IMAGE_TLS_DIRECTORY_format__

    try:
        raw_tls = self.get_data(rva, Structure(tls_format).sizeof())
        tls_struct = self.__unpack_data__(
            tls_format,
            raw_tls,
            file_offset=self.get_offset_from_rva(rva),
        )
    except PEFormatError:
        self.__warnings.append(
            "Invalid TLS information. Can't read " "data at RVA: 0x%x" % rva
        )
        return None

    if not tls_struct:
        return None

    return TlsData(struct=tls_struct)
def parse_directory_load_config(self, rva, size):
    """Parse the load configuration directory located at `rva`.

    Returns a LoadConfigData holding the unpacked structure and, when the
    structure is long enough to carry them, the dynamic relocations.
    Returns None for images that are neither PE32 nor PE32+, or when the
    structure cannot be read. `size` is not used; the structure's own first
    dword is taken as its size.
    """

    if self.PE_TYPE == OPTIONAL_HEADER_MAGIC_PE:
        full_format = self.__IMAGE_LOAD_CONFIG_DIRECTORY_format__
    elif self.PE_TYPE == OPTIONAL_HEADER_MAGIC_PE_PLUS:
        full_format = self.__IMAGE_LOAD_CONFIG_DIRECTORY64_format__
    else:
        self.__warnings.append(
            "Don't know how to parse LOAD_CONFIG information for non-PE32/"
            "PE32+ file"
        )
        return None

    declared_size = self.get_dword_at_rva(rva)

    # load config directory size can be less than represented by the full
    # format: keep only the leading fields whose cumulative size matches the
    # declared size (all of them when no prefix matches exactly).
    fields_counter = 0
    accumulated_size = 0
    for fields_counter, field in enumerate(full_format[1], start=1):
        accumulated_size += STRUCT_SIZEOF_TYPES[field.split(",")[0]]
        if accumulated_size == declared_size:
            break
    truncated_format = (full_format[0], full_format[1][:fields_counter])

    load_config_struct = None
    try:
        raw_load_config = self.get_data(rva, Structure(truncated_format).sizeof())
        load_config_struct = self.__unpack_data__(
            truncated_format,
            raw_load_config,
            file_offset=self.get_offset_from_rva(rva),
        )
    except PEFormatError:
        self.__warnings.append(
            "Invalid LOAD_CONFIG information. Can't read " "data at RVA: 0x%x" % rva
        )

    if not load_config_struct:
        return None

    # The DynamicValueRelocTable* members are only read when more than 35
    # fields were kept in the truncated format.
    dynamic_relocations = None
    if fields_counter > 35:
        dynamic_relocations = self.parse_dynamic_relocations(
            load_config_struct.DynamicValueRelocTableOffset,
            load_config_struct.DynamicValueRelocTableSection,
        )

    return LoadConfigData(
        struct=load_config_struct, dynamic_relocations=dynamic_relocations
    )
def parse_dynamic_relocations(
    self, dynamic_value_reloc_table_offset, dynamic_value_reloc_table_section
):
    """Parse the dynamic value relocation table referenced by the load config.

    Args:
        dynamic_value_reloc_table_offset: offset of the table from the start
            of the section that holds it.
        dynamic_value_reloc_table_section: 1-based index into self.sections
            of the section that holds the table.

    Returns:
        A list of DynamicRelocationData (possibly empty), or None when either
        argument is zero/None, the section index is out of range, the table
        header cannot be read, or its Version is not 1. Problems are recorded
        as warnings rather than raised.
    """
    if not dynamic_value_reloc_table_offset:
        return None
    if not dynamic_value_reloc_table_section:
        return None
    if dynamic_value_reloc_table_section > len(self.sections):
        return None

    section = self.sections[dynamic_value_reloc_table_section - 1]
    rva = section.VirtualAddress + dynamic_value_reloc_table_offset

    image_dynamic_reloc_table_struct = None
    reloc_table_size = Structure(
        self.__IMAGE_DYNAMIC_RELOCATION_TABLE_format__
    ).sizeof()
    try:
        image_dynamic_reloc_table_struct = self.__unpack_data__(
            self.__IMAGE_DYNAMIC_RELOCATION_TABLE_format__,
            self.get_data(rva, reloc_table_size),
            file_offset=self.get_offset_from_rva(rva),
        )
    except PEFormatError:
        self.__warnings.append(
            "Invalid IMAGE_DYNAMIC_RELOCATION_TABLE information. Can't read "
            "data at RVA: 0x%x" % rva
        )

    # Without a table header there is nothing to interpret. Bail out instead
    # of failing with AttributeError on the attribute accesses below.
    if image_dynamic_reloc_table_struct is None:
        return None

    if image_dynamic_reloc_table_struct.Version != 1:
        # The message has to be formatted before being appended: list.append()
        # takes a single argument.
        self.__warnings.append(
            "No parsing available for IMAGE_DYNAMIC_RELOCATION_TABLE.Version = %d"
            % image_dynamic_reloc_table_struct.Version
        )
        return None

    rva += reloc_table_size
    end = rva + image_dynamic_reloc_table_struct.Size

    # The entry layout depends only on the PE type, so pick it once.
    if self.PE_TYPE == OPTIONAL_HEADER_MAGIC_PE_PLUS:
        entry_format = self.__IMAGE_DYNAMIC_RELOCATION64_format__
    else:
        entry_format = self.__IMAGE_DYNAMIC_RELOCATION_format__
    rlc_size = Structure(entry_format).sizeof()

    dynamic_relocations = []
    while rva < end:
        try:
            dynamic_rlc = self.__unpack_data__(
                entry_format,
                self.get_data(rva, rlc_size),
                file_offset=self.get_offset_from_rva(rva),
            )
        except PEFormatError:
            self.__warnings.append(
                "Invalid relocation information. Can't read "
                "data at RVA: 0x%x" % rva
            )
            dynamic_rlc = None

        if not dynamic_rlc:
            break

        rva += rlc_size
        symbol = dynamic_rlc.Symbol
        size = dynamic_rlc.BaseRelocSize

        # Symbols 3..5 use a symbol-specific entry layout; larger values are
        # parsed as a regular base relocation list. Smaller values are skipped
        # (their payload is still stepped over below).
        relocations = None
        if 3 <= symbol <= 5:
            relocations = self.parse_image_base_relocation_list(
                rva, size, self.dynamic_relocation_format_by_symbol[symbol]
            )
        elif symbol > 5:
            relocations = self.parse_image_base_relocation_list(rva, size)

        if symbol >= 3:
            dynamic_relocations.append(
                DynamicRelocationData(
                    struct=dynamic_rlc, symbol=symbol, relocations=relocations
                )
            )

        rva += size

    return dynamic_relocations
def parse_relocations_directory(self, rva, size):
    """Parse the base relocation directory.

    Hands `rva` and `size` to parse_image_base_relocation_list() and returns
    its result unchanged.
    """
    relocation_blocks = self.parse_image_base_relocation_list(rva, size)
    return relocation_blocks
def parse_image_base_relocation_list(self, rva, size, fmt=None):
    """Parse consecutive IMAGE_BASE_RELOCATION blocks in [rva, rva + size).

    Each block header is followed by its entries, which are decoded with
    parse_relocations() or, when `fmt` is given, with
    parse_relocations_with_format(). Parsing stops at the first unreadable
    or implausible block. Returns the list of BaseRelocationData collected
    up to that point.
    """
    header_size = Structure(self.__IMAGE_BASE_RELOCATION_format__).sizeof()
    end = rva + size

    relocations = []
    while rva < end:
        # Malware that has bad RVA entries will cause an error.
        # Record a warning and stop instead of propagating the exception.
        try:
            block = self.__unpack_data__(
                self.__IMAGE_BASE_RELOCATION_format__,
                self.get_data(rva, header_size),
                file_offset=self.get_offset_from_rva(rva),
            )
        except PEFormatError:
            self.__warnings.append(
                "Invalid relocation information. Can't read "
                "data at RVA: 0x%x" % rva
            )
            block = None

        if not block:
            break

        image_size = self.OPTIONAL_HEADER.SizeOfImage

        # block.VirtualAddress must lie within the Image
        if block.VirtualAddress > image_size:
            self.__warnings.append(
                "Invalid relocation information. VirtualAddress outside"
                " of Image: 0x%x" % block.VirtualAddress
            )
            break

        # block.SizeOfBlock must be less or equal than the size of the image
        # (It's a rather loose sanity test)
        if block.SizeOfBlock > image_size:
            self.__warnings.append(
                "Invalid relocation information. SizeOfBlock too large"
                ": %d" % block.SizeOfBlock
            )
            break

        entries_rva = rva + header_size
        entries_size = block.SizeOfBlock - header_size
        if fmt is not None:
            reloc_entries = self.parse_relocations_with_format(
                entries_rva, block.VirtualAddress, entries_size, fmt
            )
        else:
            reloc_entries = self.parse_relocations(
                entries_rva, block.VirtualAddress, entries_size
            )

        relocations.append(BaseRelocationData(struct=block, entries=reloc_entries))

        # A zero-sized block would never advance the cursor.
        if not block.SizeOfBlock:
            break
        rva += block.SizeOfBlock

    return relocations
def parse_relocations(self, data_rva, rva, size):
    """Decode the 16-bit entries of one base relocation block.

    `data_rva` and `size` locate the entries' raw data, `rva` is the block's
    base address. The top 4 bits of each word are the relocation type and the
    low 12 bits the offset from `rva`. Returns a list of RelocationData;
    decoding stops early at an entry that cannot be unpacked or that repeats
    an (offset, type) pair already seen in this block. An unreadable
    `data_rva` yields an empty list and a warning.
    """
    try:
        data = self.get_data(data_rva, size)
        file_offset = self.get_offset_from_rva(data_rva)
    except PEFormatError:
        self.__warnings.append(f"Bad RVA in relocation data: 0x{data_rva:x}")
        return []

    entries = []
    seen_offset_type_pairs = set()
    word_count = len(data) // 2
    for start in range(0, word_count * 2, 2):
        entry = self.__unpack_data__(
            self.__IMAGE_BASE_RELOCATION_ENTRY_format__,
            data[start : start + 2],
            file_offset=file_offset,
        )
        if not entry:
            break

        # High nibble: type, remaining 12 bits: offset within the page.
        reloc_type, reloc_offset = divmod(entry.Data, 0x1000)
        pair = (reloc_offset, reloc_type)
        if pair in seen_offset_type_pairs:
            self.__warnings.append(
                "Overlapping offsets in relocation data "
                "at RVA: 0x%x" % (reloc_offset + rva)
            )
            break
        seen_offset_type_pairs.add(pair)

        entries.append(
            RelocationData(
                struct=entry, type=reloc_type, base_rva=rva, rva=reloc_offset + rva
            )
        )
        file_offset += entry.sizeof()

    return entries
def parse_relocations_with_format(self, data_rva, rva, size, format):
    """Decode the entries of one relocation block using a bitfield format.

    Works like parse_relocations(), but every entry is unpacked with the
    bitfield structure described by `format` and located through its
    PageRelativeOffset member. Returns a list of RelocationData; decoding
    stops early at an entry that cannot be unpacked or whose offset was
    already seen in this block. An unreadable `data_rva` yields an empty
    list and a warning.
    """
    try:
        data = self.get_data(data_rva, size)
        file_offset = self.get_offset_from_rva(data_rva)
    except PEFormatError:
        self.__warnings.append(f"Bad RVA in relocation data: 0x{data_rva:x}")
        return []

    entry_size = StructureWithBitfields(format).sizeof()
    entry_count = int(len(data) / entry_size)

    entries = []
    seen_offsets = set()
    for position in range(entry_count):
        chunk_start = position * entry_size
        entry = self.__unpack_data_with_bitfields__(
            format,
            data[chunk_start : chunk_start + entry_size],
            file_offset=file_offset,
        )
        if not entry:
            break

        reloc_offset = entry.PageRelativeOffset
        if reloc_offset in seen_offsets:
            self.__warnings.append(
                "Overlapping offsets in relocation data "
                "at RVA: 0x%x" % (reloc_offset + rva)
            )
            break
        seen_offsets.add(reloc_offset)

        entries.append(
            RelocationData(struct=entry, base_rva=rva, rva=reloc_offset + rva)
        )
        file_offset += entry_size

    return entries
def parse_debug_directory(self, rva, size):
    """Parse the debug directory into a list of DebugData.

    `size` divided by the size of IMAGE_DEBUG_DIRECTORY gives the number of
    entries read from `rva`. For CodeView (Type 2) and MISC (Type 4) entries
    the referenced payload is unpacked as well and attached as `entry`; for
    every other type `entry` is None.

    Returns None as soon as one directory entry cannot be read or unpacked.
    """
    entry_size = Structure(self.__IMAGE_DEBUG_DIRECTORY_format__).sizeof()

    debug = []
    for idx in range(int(size / entry_size)):
        entry_rva = rva + entry_size * idx
        try:
            data = self.get_data(entry_rva, entry_size)
        except PEFormatError:
            self.__warnings.append(
                "Invalid debug information. Can't read " "data at RVA: 0x%x" % rva
            )
            return None

        dbg = self.__unpack_data__(
            self.__IMAGE_DEBUG_DIRECTORY_format__,
            data,
            file_offset=self.get_offset_from_rva(entry_rva),
        )
        if not dbg:
            return None

        # apply structure according to DEBUG_TYPE
        # http://www.debuginfo.com/articles/debuginfomatch.html
        # (Type 1, IMAGE_DEBUG_TYPE_COFF, gets no extra structure.)
        dbg_type = None

        if dbg.Type == 2:
            # IMAGE_DEBUG_TYPE_CODEVIEW
            payload_offset = dbg.PointerToRawData
            payload_size = dbg.SizeOfData
            payload = self.__data__[payload_offset : payload_offset + payload_size]
            cv_signature = payload[:4]

            if cv_signature == b"RSDS":
                # pdb7.0
                pdb70_format = [
                    "CV_INFO_PDB70",
                    [
                        "4s,CvSignature",
                        "I,Signature_Data1",  # Signature is of GUID type
                        "H,Signature_Data2",
                        "H,Signature_Data3",
                        "B,Signature_Data4",
                        "B,Signature_Data5",
                        "6s,Signature_Data6",
                        "I,Age",
                    ],
                ]
                pdb_name_size = payload_size - Structure(pdb70_format).sizeof()
                # The remaining size can be negative here, as seen in the
                # malware sample with hash
                # MD5: 7c297600870d026c014d42596bb9b5fd
                # SHA256:
                # 83f4e63681fcba8a9d7bbb1688c71981b1837446514a1773597e0192bba9fac3
                # Only a positive size gets the trailing file name field.
                if pdb_name_size > 0:
                    pdb70_format[1].append(
                        "{0}s,PdbFileName".format(pdb_name_size)
                    )
                dbg_type = self.__unpack_data__(
                    pdb70_format, payload, payload_offset
                )
                if dbg_type is not None:
                    # Widen the 6-byte tail to 8 bytes so it can be read as
                    # one big-endian integer.
                    dbg_type.Signature_Data6_value = struct.unpack(
                        ">Q", b"\0\0" + dbg_type.Signature_Data6
                    )[0]
                    guid = uuid.UUID(
                        fields=(
                            dbg_type.Signature_Data1,
                            dbg_type.Signature_Data2,
                            dbg_type.Signature_Data3,
                            dbg_type.Signature_Data4,
                            dbg_type.Signature_Data5,
                            dbg_type.Signature_Data6_value,
                        )
                    )
                    guid_hex = str(guid).replace("-", "").upper()
                    dbg_type.Signature_String = guid_hex + f"{dbg_type.Age:X}"

            elif cv_signature == b"NB10":
                # pdb2.0
                pdb20_format = [
                    "CV_INFO_PDB20",
                    [
                        "I,CvHeaderSignature",
                        "I,CvHeaderOffset",
                        "I,Signature",
                        "I,Age",
                    ],
                ]
                pdb_name_size = payload_size - Structure(pdb20_format).sizeof()
                # As with the PDB 7.0 case, the variable-length file name
                # field is only added for a positive size.
                if pdb_name_size > 0:
                    pdb20_format[1].append(
                        "{0}s,PdbFileName".format(pdb_name_size)
                    )
                dbg_type = self.__unpack_data__(
                    pdb20_format, payload, payload_offset
                )

        elif dbg.Type == 4:
            # IMAGE_DEBUG_TYPE_MISC
            payload_offset = dbg.PointerToRawData
            payload_size = dbg.SizeOfData
            payload = self.__data__[payload_offset : payload_offset + payload_size]
            misc_format = [
                "IMAGE_DEBUG_MISC",
                [
                    "I,DataType",
                    "I,Length",
                    "B,Unicode",
                    "B,Reserved1",
                    "H,Reserved2",
                ],
            ]
            misc_header = self.__unpack_data__(misc_format, payload, payload_offset)

            # The header may fail to unpack (None), as with the malware sample
            # MD5: 5e7d6707d693108de5a303045c17d95b
            # SHA256:
            # 5dd94a95025f3b6e3dd440d52f7c6d2964fdd1aa119e0ee92e38c7bf83829e5c
            # presumably due to a malformed DEBUG entry.
            # The Unicode bool should be set to 0 or 1.
            if misc_header and misc_header.Unicode in (0, 1):
                misc_data_size = payload_size - Structure(misc_format).sizeof()
                # As with the PDB case, the trailing data field is only added
                # for a positive size.
                if misc_data_size > 0:
                    misc_format[1].append("{0}s,Data".format(misc_data_size))
                dbg_type = self.__unpack_data__(misc_format, payload, payload_offset)

        debug.append(DebugData(struct=dbg, entry=dbg_type))

    return debug
def parse_resources_directory(self, rva, size=0, base_rva=None, level=0, dirs=None):
    """Parse the resources directory.

    Given the RVA of the resources directory, it will process all
    its entries.

    The root will have the corresponding member of its structure,
    IMAGE_RESOURCE_DIRECTORY plus 'entries', a list of all the
    entries in the directory.

    Those entries will have, correspondingly, all the structure's
    members (IMAGE_RESOURCE_DIRECTORY_ENTRY) and an additional one,
    "directory", pointing to the IMAGE_RESOURCE_DIRECTORY structure
    representing upper layers of the tree. This one will also have
    an 'entries' attribute, pointing to the 3rd, and last, level.
    Another directory with more entries. Those last entries will
    have a new attribute (both 'leaf' or 'data_entry' can be used to
    access it). This structure finally points to the resource data.
    All the members of this structure, IMAGE_RESOURCE_DATA_ENTRY,
    are available as its attributes.

    Parameters:
        rva: RVA of the directory table to parse.
        size: only forwarded (reduced) to the recursive calls; it is not
            otherwise read by this method.
        base_rva: RVA of the root of the resource tree, against which name
            and sub-directory offsets are resolved. Defaults to `rva`.
        level: current nesting depth; parsing is refused once it exceeds
            MAX_RESOURCE_DEPTH.
        dirs: RVAs of the directories on the path leading to this one, used
            to refuse entries that point back to one of them. Defaults to
            [rva].

    Returns a ResourceDirData (struct + entries), or None when the table
    cannot be read, is nested too deep or declares too many entries. All
    problems are reported through warnings.
    """

    # OC Patch:
    if dirs is None:
        dirs = [rva]

    if base_rva is None:
        base_rva = rva

    if level > MAX_RESOURCE_DEPTH:
        self.__warnings.append(
            "Error parsing the resources directory. "
            "Excessively nested table depth %d (>%s)" % (level, MAX_RESOURCE_DEPTH)
        )
        return None

    try:
        # If the RVA is invalid all would blow up. Some EXEs seem to be
        # specially nasty and have an invalid RVA.
        data = self.get_data(
            rva, Structure(self.__IMAGE_RESOURCE_DIRECTORY_format__).sizeof()
        )
    except PEFormatError:
        self.__warnings.append(
            "Invalid resources directory. Can't read "
            "directory data at RVA: 0x%x" % rva
        )
        return None

    # Get the resource directory structure, that is, the header
    # of the table preceding the actual entries
    #
    resource_dir = self.__unpack_data__(
        self.__IMAGE_RESOURCE_DIRECTORY_format__,
        data,
        file_offset=self.get_offset_from_rva(rva),
    )
    if resource_dir is None:
        # If we can't parse the resources directory then give up on it with
        # just a warning. This directory does not necessarily have to be
        # valid to still have a valid PE file
        self.__warnings.append(
            "Invalid resources directory. Can't parse "
            "directory data at RVA: 0x%x" % rva
        )
        return None

    dir_entries = []

    # Advance the RVA to the position immediately following the directory
    # table header and pointing to the first entry in the table
    #
    rva += resource_dir.sizeof()

    number_of_entries = (
        resource_dir.NumberOfNamedEntries + resource_dir.NumberOfIdEntries
    )

    # Set a hard limit on the maximum reasonable number of entries
    MAX_ALLOWED_ENTRIES = 4096
    if number_of_entries > MAX_ALLOWED_ENTRIES:
        self.__warnings.append(
            "Error parsing the resources directory. "
            "The directory contains %d entries (>%s)"
            % (number_of_entries, MAX_ALLOWED_ENTRIES)
        )
        return None

    # The running total is kept on the instance, so it accumulates over every
    # directory table visited through this method.
    self.__total_resource_entries_count += number_of_entries
    if self.__total_resource_entries_count > MAX_RESOURCE_ENTRIES:
        self.__warnings.append(
            "Error parsing the resources directory. "
            "The file contains at least %d entries (>%d)"
            % (self.__total_resource_entries_count, MAX_RESOURCE_ENTRIES)
        )
        return None

    # Entry names are collected here and only rendered once the whole table
    # has been walked (see the render_pascal_16() loop at the end).
    strings_to_postprocess = []

    # Keep track of the last name's start and end offsets in order
    # to be able to detect overlapping entries that might suggest
    # an invalid or corrupt directory.
    last_name_begin_end = None
    for idx in range(number_of_entries):
        # Warn only once (guarded by the flag) when the bytes accounted for so
        # far exceed the configured upper bound; parsing carries on regardless.
        if (
            not self.__resource_size_limit_reached
            and self.__total_resource_bytes > self.__resource_size_limit_upperbounds
        ):
            self.__resource_size_limit_reached = True
            self.__warnings.append(
                "Resource size 0x%x exceeds file size 0x%x, overlapping "
                "resources found."
                % (
                    self.__total_resource_bytes,
                    self.__resource_size_limit_upperbounds,
                )
            )

        res = self.parse_resource_entry(rva)
        if res is None:
            self.__warnings.append(
                "Error parsing the resources directory, "
                "Entry %d is invalid, RVA = 0x%x. " % (idx, rva)
            )
            break

        entry_name = None
        entry_id = None

        # The top bit of Name selects between a string name (set) and a
        # numeric id (clear).
        name_is_string = (res.Name & 0x80000000) >> 31
        if not name_is_string:
            entry_id = res.Name
        else:
            ustr_offset = base_rva + res.NameOffset
            try:
                entry_name = UnicodeStringWrapperPostProcessor(self, ustr_offset)
                self.__total_resource_bytes += entry_name.get_pascal_16_length()
                # If the last entry's offset points before the current's but its end
                # is past the current's beginning, assume the overlap indicates a
                # corrupt name.
                if last_name_begin_end and (
                    last_name_begin_end[0] < ustr_offset
                    and last_name_begin_end[1] >= ustr_offset
                ):
                    # Remove the previous overlapping entry as it's likely to be
                    # already corrupt data.
                    strings_to_postprocess.pop()
                    self.__warnings.append(
                        "Error parsing the resources directory, "
                        "attempting to read entry name. "
                        "Entry names overlap 0x%x" % (ustr_offset)
                    )
                    break

                last_name_begin_end = (
                    ustr_offset,
                    ustr_offset + entry_name.get_pascal_16_length(),
                )

                strings_to_postprocess.append(entry_name)

            except PEFormatError:
                # The entry is still processed below; entry_name keeps
                # whatever value it had when the error was raised.
                self.__warnings.append(
                    "Error parsing the resources directory, "
                    "attempting to read entry name. "
                    "Can't read unicode string at offset 0x%x" % (ustr_offset)
                )

        if res.DataIsDirectory:
            # OC Patch:
            #
            # One trick malware can do is to recursively reference
            # the next directory. This causes hilarity to ensue when
            # trying to parse everything correctly.
            # If the RVA of the directory to parse next is already among
            # the ones on the current path, we assume that it's a trick.
            # Raising a PEFormatError would skip some reasonable data,
            # so we just break instead.
            #
            # 9ee4d0a0caf095314fd7041a3e4404dc is the offending sample
            if base_rva + res.OffsetToDirectory in dirs:
                break

            entry_directory = self.parse_resources_directory(
                base_rva + res.OffsetToDirectory,
                size - (rva - base_rva),  # size
                base_rva=base_rva,
                level=level + 1,
                dirs=dirs + [base_rva + res.OffsetToDirectory],
            )

            if not entry_directory:
                break

            # Ange Albertini's code to process resources' strings
            #
            # NOTE(review): 'strings' is filled in below but not read again
            # within this method; the per-id dictionaries are what end up
            # attached to the directories.
            strings = None
            if entry_id == RESOURCE_TYPE["RT_STRING"]:
                strings = {}
                for resource_id in entry_directory.entries:
                    if hasattr(resource_id, "directory"):
                        resource_strings = {}

                        for resource_lang in resource_id.directory.entries:
                            # Skip language entries lacking what is needed to
                            # locate and number the strings.
                            if (
                                resource_lang is None
                                or not hasattr(resource_lang, "data")
                                or resource_lang.data.struct.Size is None
                                or resource_id.id is None
                            ):
                                continue

                            string_entry_rva = (
                                resource_lang.data.struct.OffsetToData
                            )
                            string_entry_size = resource_lang.data.struct.Size
                            string_entry_id = resource_id.id

                            # XXX: has been raising exceptions preventing parsing
                            try:
                                string_entry_data = self.get_data(
                                    string_entry_rva, string_entry_size
                                )
                            except PEFormatError:
                                self.__warnings.append(
                                    f"Error parsing resource of type RT_STRING at "
                                    f"RVA 0x{string_entry_rva:x} with "
                                    f"size {string_entry_size}"
                                )
                                continue

                            # The second argument is derived from the block
                            # id as (id - 1) * 16.
                            parse_strings(
                                string_entry_data,
                                (int(string_entry_id) - 1) * 16,
                                resource_strings,
                            )
                            strings.update(resource_strings)

                        resource_id.directory.strings = resource_strings

            dir_entries.append(
                ResourceDirEntryData(
                    struct=res,
                    name=entry_name,
                    id=entry_id,
                    directory=entry_directory,
                )
            )

        else:
            # Leaf entry: the offset points to an IMAGE_RESOURCE_DATA_ENTRY.
            struct = self.parse_resource_data_entry(
                base_rva + res.OffsetToDirectory
            )

            if struct:
                self.__total_resource_bytes += struct.Size
                # At this level Name is split into language (low 10 bits)
                # and sublanguage (the bits above them).
                entry_data = ResourceDataEntryData(
                    struct=struct, lang=res.Name & 0x3FF, sublang=res.Name >> 10
                )

                dir_entries.append(
                    ResourceDirEntryData(
                        struct=res, name=entry_name, id=entry_id, data=entry_data
                    )
                )

            else:
                break

        # Check if this entry contains version information
        #
        if level == 0 and res.Id == RESOURCE_TYPE["RT_VERSION"]:
            if dir_entries:
                last_entry = dir_entries[-1]

            try:
                version_entries = last_entry.directory.entries[0].directory.entries
            except:
                # Maybe a malformed directory structure...?
                # Let's ignore it
                pass
            else:
                for version_entry in version_entries:
                    rt_version_struct = None
                    try:
                        rt_version_struct = version_entry.data.struct
                    except:
                        # Maybe a malformed directory structure...?
                        # Let's ignore it
                        pass

                    if rt_version_struct is not None:
                        self.parse_version_information(rt_version_struct)

        # Move on to the next entry of this table.
        rva += res.sizeof()

    # NOTE(review): string_rvas is built and sorted but not read afterwards
    # within this method.
    string_rvas = [s.get_rva() for s in strings_to_postprocess]
    string_rvas.sort()

    # Render the names collected while walking the table.
    for idx, s in enumerate(strings_to_postprocess):
        s.render_pascal_16()

    resource_directory_data = ResourceDirData(
        struct=resource_dir, entries=dir_entries
    )

    return resource_directory_data
def parse_resource_data_entry(self, rva):
    """Parse a data entry from the resources directory.

    Returns the unpacked IMAGE_RESOURCE_DATA_ENTRY structure, or None (with
    a warning) when the data at `rva` cannot be read.
    """
    entry_format = self.__IMAGE_RESOURCE_DATA_ENTRY_format__

    # Some EXEs seem to be specially nasty and have an invalid RVA: turn the
    # failure into a warning instead of letting it propagate.
    try:
        raw_entry = self.get_data(rva, Structure(entry_format).sizeof())
    except PEFormatError:
        self.__warnings.append(
            "Error parsing a resource directory data entry, "
            "the RVA is invalid: 0x%x" % (rva)
        )
        return None

    return self.__unpack_data__(
        entry_format,
        raw_entry,
        file_offset=self.get_offset_from_rva(rva),
    )
def parse_resource_entry(self, rva):
    """Parse a directory entry from the resources directory.

    Returns the unpacked IMAGE_RESOURCE_DIRECTORY_ENTRY with the fields
    packed into Name and OffsetToData exposed as extra attributes
    (NameOffset, Id, DataIsDirectory, OffsetToDirectory), or None when the
    entry cannot be read or unpacked.
    """
    entry_format = self.__IMAGE_RESOURCE_DIRECTORY_ENTRY_format__

    try:
        raw_entry = self.get_data(rva, Structure(entry_format).sizeof())
    except PEFormatError:
        # A warning will be added by the caller if this method returns None
        return None

    resource = self.__unpack_data__(
        entry_format,
        raw_entry,
        file_offset=self.get_offset_from_rva(rva),
    )
    if resource is None:
        return None

    name_field = resource.Name
    offset_field = resource.OffsetToData

    # Name: the low 31 bits are the name offset, the low 16 bits the id.
    resource.NameOffset = name_field & 0x7FFFFFFF
    resource.__pad = name_field & 0xFFFF0000
    resource.Id = name_field & 0x0000FFFF

    # OffsetToData: the top bit flags a sub-directory, the rest is the offset.
    resource.DataIsDirectory = (offset_field & 0x80000000) >> 31
    resource.OffsetToDirectory = offset_field & 0x7FFFFFFF

    return resource
def parse_version_information(self, version_struct):
    """Parse version information structure.

    The data will be made available in three attributes of the PE object.

    VS_VERSIONINFO will contain the first three fields of the main structure:
        'Length', 'ValueLength', and 'Type'

    VS_FIXEDFILEINFO will hold the rest of the fields, accessible as
    sub-attributes:
        'Signature', 'StrucVersion', 'FileVersionMS', 'FileVersionLS',
        'ProductVersionMS', 'ProductVersionLS', 'FileFlagsMask', 'FileFlags',
        'FileOS', 'FileType', 'FileSubtype', 'FileDateMS', 'FileDateLS'

    FileInfo is a list of all StringFileInfo and VarFileInfo structures.

    StringFileInfo structures will have a list as an attribute named
    'StringTable' containing all the StringTable structures. Each of those
    structures contains a dictionary 'entries' with all the key / value
    version information string pairs.

    VarFileInfo structures will have a list as an attribute named 'Var'
    containing all Var structures. Each Var structure will have a dictionary
    as an attribute named 'entry' which will contain the name and value of
    the Var.

    Each of the three attributes is a list that is created on first use and
    appended to on every successful call. Problems are recorded as warnings;
    the method always returns None.

    Args:
        version_struct: structure providing the `OffsetToData` (RVA) and
            `Size` of the version resource.
    """

    # Retrieve the data for the version info resource
    #
    try:
        start_offset = self.get_offset_from_rva(version_struct.OffsetToData)
    except PEFormatError:
        self.__warnings.append(
            "Error parsing the version information, "
            "attempting to read OffsetToData with RVA: 0x{:x}".format(
                version_struct.OffsetToData
            )
        )
        return
    raw_data = self.__data__[start_offset : start_offset + version_struct.Size]

    # Map the main structure and the subsequent string
    #
    versioninfo_struct = self.__unpack_data__(
        self.__VS_VERSIONINFO_format__, raw_data, file_offset=start_offset
    )

    if versioninfo_struct is None:
        return

    ustr_offset = version_struct.OffsetToData + versioninfo_struct.sizeof()
    section = self.get_section_by_rva(ustr_offset)
    section_end = None
    if section:
        section_end = section.VirtualAddress + max(
            section.SizeOfRawData, section.Misc_VirtualSize
        )

    versioninfo_string = None
    # These should return 'ascii' decoded data. For the case when it's
    # garbled data the ascii string will retain the byte values while
    # encoding it to something else may yield values that don't match the
    # file's contents.
    try:
        if section_end is None:
            versioninfo_string = self.get_string_u_at_rva(
                ustr_offset, encoding="ascii"
            )
        else:
            # Never read past the end of the containing section (the
            # length is expressed in 16-bit characters).
            versioninfo_string = self.get_string_u_at_rva(
                ustr_offset, (section_end - ustr_offset) >> 1, encoding="ascii"
            )
    except PEFormatError:
        self.__warnings.append(
            "Error parsing the version information, "
            "attempting to read VS_VERSION_INFO string. Can't "
            "read unicode string at offset 0x%x" % (ustr_offset)
        )

    if versioninfo_string is None:
        self.__warnings.append(
            "Invalid VS_VERSION_INFO block: {0}".format(versioninfo_string)
        )
        return

    # If the structure does not contain the expected name, it's assumed to
    # be invalid
    if versioninfo_string != b"VS_VERSION_INFO":
        if len(versioninfo_string) > 128:
            excerpt = versioninfo_string[:128].decode("ascii")
            # Don't leave any half-escaped characters
            excerpt = excerpt[: excerpt.rfind("\\u")]
            versioninfo_string = b(
                "{0} ... ({1} bytes, too long to display)".format(
                    excerpt, len(versioninfo_string)
                )
            )
        self.__warnings.append(
            "Invalid VS_VERSION_INFO block: {0}".format(
                versioninfo_string.decode("ascii").replace("\00", "\\00")
            )
        )
        return

    if not hasattr(self, "VS_VERSIONINFO"):
        self.VS_VERSIONINFO = []

    # Set the PE object's VS_VERSIONINFO to this one
    vinfo = versioninfo_struct

    # Set the Key attribute to point to the unicode string identifying the
    # structure
    vinfo.Key = versioninfo_string

    self.VS_VERSIONINFO.append(vinfo)

    # Process the fixed version information, get the offset and structure.
    # From here on versioninfo_string is always b"VS_VERSION_INFO" (both
    # other cases returned above).
    fixedfileinfo_offset = self.dword_align(
        versioninfo_struct.sizeof() + 2 * (len(versioninfo_string) + 1),
        version_struct.OffsetToData,
    )
    fixedfileinfo_struct = self.__unpack_data__(
        self.__VS_FIXEDFILEINFO_format__,
        raw_data[fixedfileinfo_offset:],
        file_offset=start_offset + fixedfileinfo_offset,
    )

    if not fixedfileinfo_struct:
        return

    if not hasattr(self, "VS_FIXEDFILEINFO"):
        self.VS_FIXEDFILEINFO = []

    # Set the PE object's VS_FIXEDFILEINFO to this one
    self.VS_FIXEDFILEINFO.append(fixedfileinfo_struct)

    # Start parsing all the StringFileInfo and VarFileInfo structures

    # Get the first one
    stringfileinfo_offset = self.dword_align(
        fixedfileinfo_offset + fixedfileinfo_struct.sizeof(),
        version_struct.OffsetToData,
    )

    # Set the PE object's attribute that will contain them all.
    if not hasattr(self, "FileInfo"):
        self.FileInfo = []

    finfo = []
    while True:
        # Process the StringFileInfo/VarFileInfo structure
        stringfileinfo_struct = self.__unpack_data__(
            self.__StringFileInfo_format__,
            raw_data[stringfileinfo_offset:],
            file_offset=start_offset + stringfileinfo_offset,
        )

        if stringfileinfo_struct is None:
            self.__warnings.append(
                "Error parsing StringFileInfo/VarFileInfo struct"
            )
            return None

        # Get the subsequent string defining the structure.
        ustr_offset = (
            version_struct.OffsetToData
            + stringfileinfo_offset
            + versioninfo_struct.sizeof()
        )
        try:
            stringfileinfo_string = self.get_string_u_at_rva(ustr_offset)
        except PEFormatError:
            self.__warnings.append(
                "Error parsing the version information, "
                "attempting to read StringFileInfo string. Can't "
                "read unicode string at offset 0x{0:x}".format(ustr_offset)
            )
            break

        # Set such string as the Key attribute
        stringfileinfo_struct.Key = stringfileinfo_string

        # Append the structure to the PE object's list
        finfo.append(stringfileinfo_struct)

        # Parse a StringFileInfo entry
        if stringfileinfo_string and stringfileinfo_string.startswith(
            b"StringFileInfo"
        ):
            if (
                stringfileinfo_struct.Type in (0, 1)
                and stringfileinfo_struct.ValueLength == 0
            ):
                stringtable_offset = self.dword_align(
                    stringfileinfo_offset
                    + stringfileinfo_struct.sizeof()
                    + 2 * (len(stringfileinfo_string) + 1),
                    version_struct.OffsetToData,
                )

                stringfileinfo_struct.StringTable = []

                # Process the String Table entries
                while True:
                    stringtable_struct = self.__unpack_data__(
                        self.__StringTable_format__,
                        raw_data[stringtable_offset:],
                        file_offset=start_offset + stringtable_offset,
                    )

                    if not stringtable_struct:
                        break

                    ustr_offset = (
                        version_struct.OffsetToData
                        + stringtable_offset
                        + stringtable_struct.sizeof()
                    )
                    try:
                        stringtable_string = self.get_string_u_at_rva(ustr_offset)
                    except PEFormatError:
                        self.__warnings.append(
                            "Error parsing the version information, "
                            "attempting to read StringTable string. Can't "
                            "read unicode string at offset 0x{0:x}".format(
                                ustr_offset
                            )
                        )
                        break

                    stringtable_struct.LangID = stringtable_string
                    stringtable_struct.entries = {}
                    stringtable_struct.entries_offsets = {}
                    stringtable_struct.entries_lengths = {}
                    stringfileinfo_struct.StringTable.append(stringtable_struct)

                    entry_offset = self.dword_align(
                        stringtable_offset
                        + stringtable_struct.sizeof()
                        + 2 * (len(stringtable_string) + 1),
                        version_struct.OffsetToData,
                    )

                    # Process all entries in the string table
                    while (
                        entry_offset
                        < stringtable_offset + stringtable_struct.Length
                    ):
                        string_struct = self.__unpack_data__(
                            self.__String_format__,
                            raw_data[entry_offset:],
                            file_offset=start_offset + entry_offset,
                        )

                        if not string_struct:
                            break

                        ustr_offset = (
                            version_struct.OffsetToData
                            + entry_offset
                            + string_struct.sizeof()
                        )
                        try:
                            key = self.get_string_u_at_rva(ustr_offset)
                            key_offset = self.get_offset_from_rva(ustr_offset)
                        except PEFormatError:
                            self.__warnings.append(
                                "Error parsing the version information, "
                                "attempting to read StringTable Key string. Can't "
                                "read unicode string at offset 0x{0:x}".format(
                                    ustr_offset
                                )
                            )
                            break

                        value_offset = self.dword_align(
                            2 * (len(key) + 1)
                            + entry_offset
                            + string_struct.sizeof(),
                            version_struct.OffsetToData,
                        )

                        ustr_offset = version_struct.OffsetToData + value_offset
                        try:
                            value = self.get_string_u_at_rva(
                                ustr_offset, max_length=string_struct.ValueLength
                            )
                            # From here on value_offset is a file offset.
                            value_offset = self.get_offset_from_rva(ustr_offset)
                        except PEFormatError:
                            self.__warnings.append(
                                "Error parsing the version information, attempting "
                                "to read StringTable Value string. Can't read "
                                f"unicode string at offset 0x{ustr_offset:x}"
                            )
                            break

                        if string_struct.Length == 0:
                            # A zero-length entry cannot advance the cursor;
                            # jump to the end of the table so the loop ends.
                            entry_offset = (
                                stringtable_offset + stringtable_struct.Length
                            )
                        else:
                            entry_offset = self.dword_align(
                                string_struct.Length + entry_offset,
                                version_struct.OffsetToData,
                            )

                        stringtable_struct.entries[key] = value
                        stringtable_struct.entries_offsets[key] = (
                            key_offset,
                            value_offset,
                        )
                        stringtable_struct.entries_lengths[key] = (
                            len(key),
                            len(value),
                        )

                    new_stringtable_offset = self.dword_align(
                        stringtable_struct.Length + stringtable_offset,
                        version_struct.OffsetToData,
                    )

                    # Check if the entry is crafted in a way that would lead
                    # to an infinite loop and break if so.
                    if new_stringtable_offset == stringtable_offset:
                        break
                    stringtable_offset = new_stringtable_offset

                    # NOTE(review): this compares an offset measured from the
                    # start of the version resource against the StringFileInfo
                    # length alone (not stringfileinfo_offset + Length), so it
                    # may stop early when several tables exist. Left unchanged;
                    # confirm the intended bound.
                    if stringtable_offset >= stringfileinfo_struct.Length:
                        break

        # Parse a VarFileInfo entry
        elif stringfileinfo_string and stringfileinfo_string.startswith(
            b"VarFileInfo"
        ):
            varfileinfo_struct = stringfileinfo_struct
            varfileinfo_struct.name = "VarFileInfo"

            if (
                varfileinfo_struct.Type in (0, 1)
                and varfileinfo_struct.ValueLength == 0
            ):
                var_offset = self.dword_align(
                    stringfileinfo_offset
                    + varfileinfo_struct.sizeof()
                    + 2 * (len(stringfileinfo_string) + 1),
                    version_struct.OffsetToData,
                )

                varfileinfo_struct.Var = []

                # The VarFileInfo block (header included) spans `Length`
                # bytes starting at stringfileinfo_offset.
                varfileinfo_end = stringfileinfo_offset + varfileinfo_struct.Length

                # Process all entries
                while True:
                    var_struct = self.__unpack_data__(
                        self.__Var_format__,
                        raw_data[var_offset:],
                        file_offset=start_offset + var_offset,
                    )

                    if not var_struct:
                        break

                    ustr_offset = (
                        version_struct.OffsetToData
                        + var_offset
                        + var_struct.sizeof()
                    )
                    try:
                        var_string = self.get_string_u_at_rva(ustr_offset)
                    except PEFormatError:
                        self.__warnings.append(
                            "Error parsing the version information, "
                            "attempting to read VarFileInfo Var string. "
                            "Can't read unicode string at offset 0x{0:x}".format(
                                ustr_offset
                            )
                        )
                        break

                    if var_string is None:
                        break

                    varfileinfo_struct.Var.append(var_struct)

                    varword_offset = self.dword_align(
                        2 * (len(var_string) + 1)
                        + var_offset
                        + var_struct.sizeof(),
                        version_struct.OffsetToData,
                    )
                    orig_varword_offset = varword_offset

                    # The value is read as consecutive pairs of 16-bit words.
                    # Every pair overwrites `entry`, so only the last one
                    # read is kept.
                    while (
                        varword_offset
                        < orig_varword_offset + var_struct.ValueLength
                    ):
                        word1 = self.get_word_from_data(
                            raw_data[varword_offset : varword_offset + 2], 0
                        )
                        word2 = self.get_word_from_data(
                            raw_data[varword_offset + 2 : varword_offset + 4], 0
                        )
                        varword_offset += 4

                        if isinstance(word1, int) and isinstance(word2, int):
                            var_struct.entry = {
                                var_string: "0x%04x 0x%04x" % (word1, word2)
                            }

                    next_var_offset = self.dword_align(
                        var_offset + var_struct.Length, version_struct.OffsetToData
                    )

                    # Stop once the cursor no longer advances (crafted
                    # zero-length Var) or has reached the end of the
                    # VarFileInfo block. The previous test compared
                    # var_offset with itself plus a non-negative length, was
                    # therefore always true, and ended the loop after the
                    # first Var.
                    if (
                        next_var_offset <= var_offset
                        or next_var_offset >= varfileinfo_end
                    ):
                        break
                    var_offset = next_var_offset

        # Increment and align the offset
        stringfileinfo_offset = self.dword_align(
            stringfileinfo_struct.Length + stringfileinfo_offset,
            version_struct.OffsetToData,
        )

        # Check if all the StringFileInfo and VarFileInfo items have been
        # processed
        if (
            stringfileinfo_struct.Length == 0
            or stringfileinfo_offset >= versioninfo_struct.Length
        ):
            break

    self.FileInfo.append(finfo)
def parse_export_directory(self, rva, size, forwarded_only=False):
    """Parse the export directory.

    Given the RVA of the export directory, it will process all
    its entries: first the exports that have a name, then the exports that
    are only reachable by ordinal.

    Args:
        rva: RVA of the export directory.
        size: size of the export directory; a function address that falls
            inside [rva, rva + size) is treated as a forwarder string.
        forwarded_only: when True, named exports that are not forwarders
            are skipped.

    Returns:
        An ExportDirData instance whose `symbols` is a list of ExportData
        instances, or None when the directory cannot be read or is judged
        to be corrupt (warnings are recorded for most of those cases).
    """

    try:
        export_dir = self.__unpack_data__(
            self.__IMAGE_EXPORT_DIRECTORY_format__,
            self.get_data(
                rva, Structure(self.__IMAGE_EXPORT_DIRECTORY_format__).sizeof()
            ),
            file_offset=self.get_offset_from_rva(rva),
        )
    except PEFormatError:
        self.__warnings.append(
            "Error parsing export directory at RVA: 0x%x" % (rva)
        )
        return

    if not export_dir:
        return

    # We keep track of the bytes left in the file and use it to set a upper
    # bound in the number of items that can be read from the different
    # arrays.
    def length_until_eof(rva):
        return len(self.__data__) - self.get_offset_from_rva(rva)

    try:
        address_of_names = self.get_data(
            export_dir.AddressOfNames,
            min(
                length_until_eof(export_dir.AddressOfNames),
                export_dir.NumberOfNames * 4,
            ),
        )
        address_of_name_ordinals = self.get_data(
            export_dir.AddressOfNameOrdinals,
            min(
                length_until_eof(export_dir.AddressOfNameOrdinals),
                export_dir.NumberOfNames * 4,
            ),
        )
        address_of_functions = self.get_data(
            export_dir.AddressOfFunctions,
            min(
                length_until_eof(export_dir.AddressOfFunctions),
                export_dir.NumberOfFunctions * 4,
            ),
        )
    except PEFormatError:
        self.__warnings.append(
            "Error parsing export directory at RVA: 0x%x" % (rva)
        )
        return

    exports = []

    # ---- Pass 1: exports that have a name -------------------------------

    max_failed_entries_before_giving_up = 10

    section = self.get_section_by_rva(export_dir.AddressOfNames)
    # Overly generous upper bound
    safety_boundary = len(self.__data__)
    if section:
        safety_boundary = (
            section.VirtualAddress
            + len(section.get_data())
            - export_dir.AddressOfNames
        )

    symbol_counts = collections.defaultdict(int)
    export_parsing_loop_completed_normally = True
    for i in range(min(export_dir.NumberOfNames, safety_boundary // 4)):
        symbol_ordinal = self.get_word_from_data(address_of_name_ordinals, i)
        if symbol_ordinal is not None and symbol_ordinal * 4 < len(
            address_of_functions
        ):
            symbol_address = self.get_dword_from_data(
                address_of_functions, symbol_ordinal
            )
        else:
            # Corrupt? a bad pointer... we assume it's all
            # useless, no exports
            return None
        if symbol_address is None or symbol_address == 0:
            continue

        # If the function's RVA points within the export directory
        # it will point to a string with the forwarded symbol's string
        # instead of pointing to the function start address.
        if symbol_address >= rva and symbol_address < rva + size:
            forwarder_str = self.get_string_at_rva(symbol_address)
            try:
                forwarder_offset = self.get_offset_from_rva(symbol_address)
            except PEFormatError:
                continue
        else:
            if forwarded_only:
                continue
            forwarder_str = None
            forwarder_offset = None

        symbol_name_address = self.get_dword_from_data(address_of_names, i)
        if symbol_name_address is None:
            max_failed_entries_before_giving_up -= 1
            if max_failed_entries_before_giving_up <= 0:
                export_parsing_loop_completed_normally = False
                break
            # NOTE(review): there is no `continue` here, so a None address
            # is still handed to get_string_at_rva below — confirm that this
            # is intended.

        symbol_name = self.get_string_at_rva(
            symbol_name_address, MAX_SYMBOL_NAME_LENGTH
        )
        if not is_valid_function_name(symbol_name, relax_allowed_characters=True):
            export_parsing_loop_completed_normally = False
            break

        # A single lookup is enough: repeating the same call with the same
        # argument (as was done before) only charged the failure budget
        # twice for one bad entry.
        try:
            symbol_name_offset = self.get_offset_from_rva(symbol_name_address)
        except PEFormatError:
            max_failed_entries_before_giving_up -= 1
            if max_failed_entries_before_giving_up <= 0:
                export_parsing_loop_completed_normally = False
                break
            continue

        # File 0b1d3d3664915577ab9a32188d29bbf3542b86c7b9ce333e245496c3018819f1
        # was being parsed as potentially containing millions of exports.
        # Checking for duplicates addresses the issue.
        symbol_counts[(symbol_name, symbol_address)] += 1
        if symbol_counts[(symbol_name, symbol_address)] > 10:
            self.__warnings.append(
                f"Export directory contains more than 10 repeated entries "
                f"({symbol_name}, {symbol_address:#02x}). Assuming corrupt."
            )
            break
        elif len(symbol_counts) > self.max_symbol_exports:
            self.__warnings.append(
                "Export directory contains more than {} symbol entries. "
                "Assuming corrupt.".format(self.max_symbol_exports)
            )
            break

        exports.append(
            ExportData(
                pe=self,
                ordinal=export_dir.Base + symbol_ordinal,
                ordinal_offset=self.get_offset_from_rva(
                    export_dir.AddressOfNameOrdinals + 2 * i
                ),
                address=symbol_address,
                address_offset=self.get_offset_from_rva(
                    export_dir.AddressOfFunctions + 4 * symbol_ordinal
                ),
                name=symbol_name,
                name_offset=symbol_name_offset,
                forwarder=forwarder_str,
                forwarder_offset=forwarder_offset,
            )
        )

    # A failure in the first pass is only reported; the second pass still
    # runs.
    if not export_parsing_loop_completed_normally:
        self.__warnings.append(
            f"RVA AddressOfNames in the export directory points to an invalid "
            f"address: {export_dir.AddressOfNames:x}"
        )

    # ---- Pass 2: exports without a name (ordinal only) ------------------

    ordinals = {exp.ordinal for exp in exports}

    max_failed_entries_before_giving_up = 10

    section = self.get_section_by_rva(export_dir.AddressOfFunctions)
    # Overly generous upper bound
    safety_boundary = len(self.__data__)
    if section:
        safety_boundary = (
            section.VirtualAddress
            + len(section.get_data())
            - export_dir.AddressOfFunctions
        )

    symbol_counts = collections.defaultdict(int)
    export_parsing_loop_completed_normally = True
    for idx in range(min(export_dir.NumberOfFunctions, safety_boundary // 4)):
        # Only entries not already collected by name are of interest.
        if idx + export_dir.Base not in ordinals:
            try:
                symbol_address = self.get_dword_from_data(address_of_functions, idx)
            except PEFormatError:
                symbol_address = None

            if symbol_address is None:
                max_failed_entries_before_giving_up -= 1
                if max_failed_entries_before_giving_up <= 0:
                    export_parsing_loop_completed_normally = False
                    break

            if symbol_address == 0:
                continue

            # Checking for forwarder again.
            if (
                symbol_address is not None
                and symbol_address >= rva
                and symbol_address < rva + size
            ):
                forwarder_str = self.get_string_at_rva(symbol_address)
            else:
                forwarder_str = None

            # File 0b1d3d3664915577ab9a32188d29bbf3542b86c7b9ce333e245496c3018819f1
            # was being parsed as potentially containing millions of exports.
            # Checking for duplicates addresses the issue.
            symbol_counts[symbol_address] += 1
            if symbol_counts[symbol_address] > self.max_repeated_symbol:
                self.__warnings.append(
                    "Export directory contains more than {} repeated "
                    "ordinal entries (0x{:x}). Assuming corrupt.".format(
                        self.max_repeated_symbol, symbol_address
                    )
                )
                break
            elif len(symbol_counts) > self.max_symbol_exports:
                self.__warnings.append(
                    "Export directory contains more than "
                    f"{self.max_symbol_exports} ordinal entries. Assuming corrupt."
                )
                break

            exports.append(
                ExportData(
                    ordinal=export_dir.Base + idx,
                    address=symbol_address,
                    name=None,
                    forwarder=forwarder_str,
                )
            )

    # Unlike the first pass, a failure here discards everything collected.
    if not export_parsing_loop_completed_normally:
        self.__warnings.append(
            "RVA AddressOfFunctions in the export directory points to an invalid "
            f"address: {export_dir.AddressOfFunctions:x}"
        )
        return

    if not exports and export_dir.all_zeroes():
        return None

    return ExportDirData(
        struct=export_dir,
        symbols=exports,
        name=self.get_string_at_rva(export_dir.Name),
    )
def dword_align(self, offset, base):
    """Align `offset` (relative to `base`) to a 4-byte boundary.

    `offset + base` is rounded up to the next multiple of four and the
    result is returned relative to `base` rounded down to a multiple of
    four. Both intermediate values are truncated to 32 bits by the mask.
    """
    dword_mask = 0xFFFFFFFC
    aligned_absolute = (offset + base + 3) & dword_mask
    aligned_base = base & dword_mask
    return aligned_absolute - aligned_base
def normalize_import_va(self, va):
    """Convert a virtual address that lies inside the image into an RVA.

    When `va` falls within [ImageBase, ImageBase + SizeOfImage) the image
    base is subtracted from it. Any other value (zero and bogus addresses
    outside the image included) is returned unchanged.
    """
    image_start = self.OPTIONAL_HEADER.ImageBase
    image_stop = self.OPTIONAL_HEADER.ImageBase + self.OPTIONAL_HEADER.SizeOfImage

    if image_start <= va < image_stop:
        return va - image_start
    return va
def parse_delay_import_directory(self, rva, size):
    """Walk and parse the delay import directory.

    Descriptors are read one after another starting at `rva` until an
    unreadable or all-zero descriptor is found, or until too many errors /
    imported symbols have been seen.

    Args:
        rva: RVA of the first delay import descriptor.
        size: not used by this method.

    Returns:
        A list of ImportDescData instances, one per descriptor for which
        imports could be parsed.
    """

    import_descs = []
    error_count = 0
    while True:
        try:
            # If the RVA is invalid all would blow up. Some PEs seem to be
            # specially nasty and have an invalid RVA.
            data = self.get_data(
                rva,
                Structure(self.__IMAGE_DELAY_IMPORT_DESCRIPTOR_format__).sizeof(),
            )
        except PEFormatError:
            self.__warnings.append(
                "Error parsing the Delay import directory at RVA: 0x%x" % (rva)
            )
            break

        file_offset = self.get_offset_from_rva(rva)
        import_desc = self.__unpack_data__(
            self.__IMAGE_DELAY_IMPORT_DESCRIPTOR_format__,
            data,
            file_offset=file_offset,
        )

        # If the structure is all zeros, we reached the end of the list
        if not import_desc or import_desc.all_zeroes():
            break
        contains_addresses = False

        # Handle old import descriptor that has Virtual Addresses instead of RVAs
        # This version of import descriptor is created by old Visual Studio versions
        # (pre 6.0)
        # Can only be present in 32-bit binaries (no 64-bit compiler existed at the
        # time)
        # Sample: e8d3bff0c1a9a6955993f7a441121a2692261421e82fdfadaaded45d3bea9980
        if (
            import_desc.grAttrs == 0
            and self.FILE_HEADER.Machine == MACHINE_TYPE["IMAGE_FILE_MACHINE_I386"]
        ):
            # Every address field is normalised from its own value. Driving
            # this from a list avoids the copy-paste slip that used to
            # overwrite phmod with the normalised pUnloadIAT.
            for field_name in (
                "pBoundIAT",
                "pIAT",
                "pINT",
                "pUnloadIAT",
                "phmod",
                "szName",
            ):
                setattr(
                    import_desc,
                    field_name,
                    self.normalize_import_va(getattr(import_desc, field_name)),
                )
            contains_addresses = True

        rva += import_desc.sizeof()

        # If the array of thunks is somewhere earlier than the import
        # descriptor we can set a maximum length for the array. Otherwise
        # just set a maximum length of the size of the file
        max_len = len(self.__data__) - file_offset
        if rva > import_desc.pINT or rva > import_desc.pIAT:
            max_len = max(rva - import_desc.pINT, rva - import_desc.pIAT)

        import_data = []
        try:
            import_data = self.parse_imports(
                import_desc.pINT,
                import_desc.pIAT,
                None,
                max_len,
                contains_addresses,
            )
        except PEFormatError as excp:
            self.__warnings.append(
                "Error parsing the Delay import directory. "
                "Invalid import data at RVA: 0x{0:x} ({1})".format(rva, excp.value)
            )

        if error_count > 5:
            self.__warnings.append(
                "Too many errors parsing the Delay import directory. "
                "Invalid import data at RVA: 0x{0:x}".format(rva)
            )
            break

        # A descriptor without usable imports counts as an error and is
        # skipped.
        if not import_data:
            error_count += 1
            continue

        if self.__total_import_symbols > MAX_IMPORT_SYMBOLS:
            self.__warnings.append(
                "Error, too many imported symbols %d (>%s)"
                % (self.__total_import_symbols, MAX_IMPORT_SYMBOLS)
            )
            break

        dll = self.get_string_at_rva(import_desc.szName, MAX_DLL_LENGTH)
        if not is_valid_dos_filename(dll):
            dll = b("*invalid*")

        if dll:
            # Give ordinal-only imports a name when the ordinal is known.
            for symbol in import_data:
                if symbol.name is None:
                    funcname = ordlookup.ordLookup(dll.lower(), symbol.ordinal)
                    if funcname:
                        symbol.name = funcname
            import_descs.append(
                ImportDescData(struct=import_desc, imports=import_data, dll=dll)
            )

    return import_descs
def get_rich_header_hash(self, algorithm="md5"):
    """Return the hex digest of the Rich header's `clear_data`.

    Args:
        algorithm: one of "md5", "sha1", "sha256" or "sha512".

    Returns:
        The hex digest, or an empty string when the object has no
        RICH_HEADER attribute or it is None.

    Raises:
        Exception: when a Rich header exists and `algorithm` is not one of
            the supported names.
    """
    if getattr(self, "RICH_HEADER", None) is None:
        return ""

    supported = (
        ("md5", md5),
        ("sha1", sha1),
        ("sha256", sha256),
        ("sha512", sha512),
    )
    for known_name, hash_factory in supported:
        if algorithm == known_name:
            return hash_factory(self.RICH_HEADER.clear_data).hexdigest()

    raise Exception("Invalid hashing algorithm specified")
def get_imphash(self):
    """Return the imphash of the PE file.

    Creates a hash based on imported symbol names and their specific order
    within the executable:
    https://www.mandiant.com/resources/blog/tracking-malware-import-hashing

    Every import contributes "<library>.<function>" in lower case, where
    the library name loses a trailing ".ocx", ".sys" or ".dll" extension.
    Imports without a name are resolved through `ordlookup`.

    Returns:
        the hexdigest of the MD5 hash of the imported symbols, or an empty
        string when the PE has no DIRECTORY_ENTRY_IMPORT attribute.

    Raises:
        PEFormatError: when an import without a name cannot be resolved
            from its ordinal.
    """

    if not hasattr(self, "DIRECTORY_ENTRY_IMPORT"):
        return ""

    stripped_extensions = ["ocx", "sys", "dll"]
    hash_items = []

    for entry in self.DIRECTORY_ENTRY_IMPORT:
        library = entry.dll
        if isinstance(library, bytes):
            library = library.decode()
        library = library.lower()

        stem_and_ext = library.rsplit(".", 1)
        if len(stem_and_ext) > 1 and stem_and_ext[1] in stripped_extensions:
            library = stem_and_ext[0]

        # The ordinal lookup receives the DLL name as stored (not decoded).
        lookup_key = entry.dll.lower()

        for imp in entry.imports:
            symbol = imp.name
            if not symbol:
                symbol = ordlookup.ordLookup(
                    lookup_key, imp.ordinal, make_name=True
                )
                if not symbol:
                    raise PEFormatError(
                        f"Unable to look up ordinal {entry.dll}:{imp.ordinal:04x}"
                    )

            if isinstance(symbol, bytes):
                symbol = symbol.decode()

            hash_items.append("%s.%s" % (library.lower(), symbol.lower()))

    return md5(",".join(hash_items).encode()).hexdigest()
def get_exphash(self):
    """Return the exphash of the PE file.

    Similar to imphash, but based on exported symbol names and their
    specific order. Exports without a name are ignored.

    Returns:
        the hexdigest of the SHA256 hash of the exported symbols, or an
        empty string when there is no export directory, it has no `symbols`
        attribute, or none of the exports has a name.
    """

    has_symbols = hasattr(self, "DIRECTORY_ENTRY_EXPORT") and hasattr(
        self.DIRECTORY_ENTRY_EXPORT, "symbols"
    )
    if not has_symbols:
        return ""

    lowered_names = []
    for exported in self.DIRECTORY_ENTRY_EXPORT.symbols:
        if exported and exported.name is not None:
            lowered_names.append(exported.name.decode().lower())

    if not lowered_names:
        return ""

    return sha256(",".join(lowered_names).encode()).hexdigest()
def parse_import_directory(self, rva, size, dllnames_only=False):
    """Walk and parse the import directory.

    Descriptors are read one after another starting at `rva` until an
    unreadable or all-zero descriptor is found, or too many errors occur.

    Args:
        rva: RVA of the first import descriptor.
        size: not used by this method.
        dllnames_only: when True the imported symbols are not parsed; each
            returned entry then carries an empty `imports` list.

    Returns:
        A list of ImportDescData instances. When symbols were parsed, a
        warning is additionally recorded if the imports look like those of
        a packed executable.
    """

    descriptors = []
    failures = 0
    descriptor_format = self.__IMAGE_IMPORT_DESCRIPTOR_format__
    descriptor_size = Structure(descriptor_format).sizeof()

    while True:
        try:
            # If the RVA is invalid all would blow up. Some EXEs seem to be
            # specially nasty and have an invalid RVA.
            raw_descriptor = self.get_data(rva, descriptor_size)
        except PEFormatError:
            self.__warnings.append(
                f"Error parsing the import directory at RVA: 0x{rva:x}"
            )
            break

        descriptor_offset = self.get_offset_from_rva(rva)
        import_desc = self.__unpack_data__(
            descriptor_format, raw_descriptor, file_offset=descriptor_offset
        )

        # An all-zero descriptor terminates the list.
        if not import_desc or import_desc.all_zeroes():
            break

        rva += import_desc.sizeof()

        # When a thunk array starts before the (already advanced) descriptor
        # RVA, the distance between them bounds the array length. Otherwise
        # fall back to the number of bytes left in the file.
        ilt_rva = import_desc.OriginalFirstThunk
        iat_rva = import_desc.FirstThunk
        if rva > ilt_rva or rva > iat_rva:
            max_len = max(rva - ilt_rva, rva - iat_rva)
        else:
            max_len = len(self.__data__) - descriptor_offset

        import_data = []
        if not dllnames_only:
            try:
                import_data = self.parse_imports(
                    ilt_rva,
                    iat_rva,
                    import_desc.ForwarderChain,
                    max_length=max_len,
                )
            except PEFormatError as e:
                self.__warnings.append(
                    "Error parsing the import directory. "
                    f"Invalid Import data at RVA: 0x{rva:x} ({e.value})"
                )

            if failures > 5:
                self.__warnings.append(
                    "Too many errors parsing the import directory. "
                    f"Invalid import data at RVA: 0x{rva:x}"
                )
                break

            if not import_data:
                failures += 1
                # TODO: do not continue here
                continue

        dll = self.get_string_at_rva(import_desc.Name, MAX_DLL_LENGTH)
        if not is_valid_dos_filename(dll):
            dll = b("*invalid*")

        if not dll:
            continue

        # Give ordinal-only imports a name when the ordinal is known.
        for symbol in import_data:
            if symbol.name is None:
                resolved = ordlookup.ordLookup(dll.lower(), symbol.ordinal)
                if resolved:
                    symbol.name = resolved
        descriptors.append(
            ImportDescData(struct=import_desc, imports=import_data, dll=dll)
        )

    if not dllnames_only:
        # Heuristic: very few imports, among them names starting with
        # LoadLibrary / GetProcAddress.
        packer_prefixes = ("LoadLibrary", "GetProcAddress")
        prefix_hits = 0
        symbol_total = 0
        for descriptor in descriptors:
            for symbol in descriptor.imports:
                symbol_total += 1
                if not symbol or not symbol.name:
                    continue
                name = symbol.name
                if isinstance(name, bytes):
                    name = name.decode("utf-8")
                if name.startswith(packer_prefixes):
                    prefix_hits += 1

        if prefix_hits == len(packer_prefixes) and symbol_total < 20:
            self.__warnings.append(
                "Imported symbols contain entries typical of packed executables."
            )

    return descriptors
def parse_imports(
    self,
    original_first_thunk,
    first_thunk,
    forwarder_chain,
    max_length=None,
    contains_addresses=False,
):
    """Parse the imported symbols of a single import descriptor.

    The Import Lookup Table (at `original_first_thunk`) and the Import
    Address Table (at `first_thunk`) are both read; the ILT is walked when
    it is available, the IAT otherwise.

    Args:
        original_first_thunk: RVA of the Import Lookup Table.
        first_thunk: RVA of the Import Address Table.
        forwarder_chain: not used by this method.
        max_length: forwarded to `get_import_table`.
        contains_addresses: forwarded to `get_import_table`.

    Returns:
        A list of ImportData instances. The list is empty (and a warning is
        recorded) when neither table yields any entry.

    Raises:
        PEFormatError: when an entry provides neither an ordinal nor a
            name, or when more than 1000 entries were seen and every one
            of them had an invalid name.
    """

    imported_symbols = []

    # Import Lookup Table. Contains ordinals or pointers to strings.
    ilt = self.get_import_table(
        original_first_thunk, max_length, contains_addresses
    )
    # Import Address Table. May have identical content to ILT if
    # PE file is not bound. It will contain the address of the
    # imported symbols once the binary is loaded or if it is already
    # bound.
    iat = self.get_import_table(first_thunk, max_length, contains_addresses)

    # Either table may be None or empty.
    if not iat and not ilt:
        self.__warnings.append(
            "Damaged Import Table information. "
            "ILT and/or IAT appear to be broken. "
            f"OriginalFirstThunk: 0x{original_first_thunk:x} "
            f"FirstThunk: 0x{first_thunk:x}"
        )
        return []

    # At least one of the tables is usable here; prefer the ILT.
    table = ilt if ilt else iat

    if self.PE_TYPE == OPTIONAL_HEADER_MAGIC_PE_PLUS:
        ordinal_flag = IMAGE_ORDINAL_FLAG64
        imp_offset = 8
        address_mask = 0x7FFFFFFFFFFFFFFF
    else:
        # PE32, and also any invalid value in the Magic field of the
        # Optional Header: just in case the remaining file is parseable
        # it is treated as a 32bit PE32.
        ordinal_flag = IMAGE_ORDINAL_FLAG
        imp_offset = 4
        address_mask = 0x7FFFFFFF

    invalid_seen = 0
    for idx, tbl_entry in enumerate(table):
        imp_ord = None
        imp_hint = None
        imp_name = None
        name_offset = None
        hint_name_table_rva = None
        import_by_ordinal = False

        if tbl_entry.AddressOfData:
            if tbl_entry.AddressOfData & ordinal_flag:
                # Imported by ordinal: only the ordinal number is known.
                import_by_ordinal = True
                imp_ord = tbl_entry.AddressOfData & 0xFFFF
            else:
                # Imported by name: a 2-byte hint followed by the name.
                try:
                    hint_name_table_rva = tbl_entry.AddressOfData & address_mask
                    hint_bytes = self.get_data(hint_name_table_rva, 2)
                    imp_hint = self.get_word_from_data(hint_bytes, 0)
                    imp_name = self.get_string_at_rva(
                        tbl_entry.AddressOfData + 2, MAX_IMPORT_NAME_LENGTH
                    )
                    if not is_valid_function_name(imp_name):
                        imp_name = b("*invalid*")

                    name_offset = self.get_offset_from_rva(
                        tbl_entry.AddressOfData + 2
                    )
                except PEFormatError:
                    # Whatever was read before the failure is kept.
                    pass

            # by nriva: we want the ThunkRVA and ThunkOffset
            thunk_offset = tbl_entry.get_file_offset()
            thunk_rva = self.get_rva_from_offset(thunk_offset)

        imp_address = (
            first_thunk + self.OPTIONAL_HEADER.ImageBase + idx * imp_offset
        )

        # The import is reported as bound when both tables exist and
        # disagree for this index.
        struct_iat = None
        imp_bound = None
        try:
            if iat and ilt and ilt[idx].AddressOfData != iat[idx].AddressOfData:
                imp_bound = iat[idx].AddressOfData
                struct_iat = iat[idx]
        except IndexError:
            imp_bound = None

        # The file with hashes:
        #
        # MD5: bfe97192e8107d52dd7b4010d12b2924
        # SHA256: 3d22f8b001423cb460811ab4f4789f277b35838d45c62ec0454c877e7c82c7f5
        #
        # has an invalid table built in a way that it's parseable but contains
        # invalid entries that lead pefile to take extremely long amounts of
        # time to parse. It also leads to extreme memory consumption.
        # To prevent similar cases, if invalid entries are found in the middle
        # of a table the parsing will be aborted
        #
        if imp_ord is None and imp_name is None:
            raise PEFormatError("Invalid entries, aborting parsing.")

        # Some PEs appear to interleave valid and invalid imports. Instead of
        # aborting the parsing altogether we will simply skip the invalid
        # entries. Although if we see 1000 invalid entries and no legit ones,
        # we abort.
        if imp_name == b("*invalid*"):
            if invalid_seen > 1000 and invalid_seen == idx:
                raise PEFormatError("Too many invalid names, aborting parsing.")
            invalid_seen += 1
            continue

        if not (imp_ord or imp_name):
            continue

        imported_symbols.append(
            ImportData(
                pe=self,
                struct_table=tbl_entry,
                struct_iat=struct_iat,  # for bound imports if any
                import_by_ordinal=import_by_ordinal,
                ordinal=imp_ord,
                ordinal_offset=tbl_entry.get_file_offset(),
                hint=imp_hint,
                name=imp_name,
                name_offset=name_offset,
                bound=imp_bound,
                address=imp_address,
                hint_name_table_rva=hint_name_table_rva,
                thunk_offset=thunk_offset,
                thunk_rva=thunk_rva,
            )
        )

    return imported_symbols
def get_import_table(self, rva, max_length=None, contains_addresses=False):
    """Read the array of IMAGE_THUNK_DATA entries that starts at `rva`.

    Thunks are unpacked one at a time until an all-zero terminator is
    found or one of the sanity heuristics below decides to stop.

    Args:
        rva: RVA of the first thunk. A falsy value yields an empty table.
        max_length: optional size, in bytes, of the region the table may
            occupy. Parsing stops with a warning once it is exceeded.
        contains_addresses: when True the thunk fields are passed through
            normalize_import_va() before being examined.

    Returns:
        The list of thunk structures read so far (the terminator is not
        included), an empty list when the table is judged to be bogus, or
        None when the bytes of a thunk could not be read.
    """
    table = []

    # We need the ordinal flag for a simple heuristic
    # we're implementing within the loop
    #
    if self.PE_TYPE == OPTIONAL_HEADER_MAGIC_PE:
        ordinal_flag = IMAGE_ORDINAL_FLAG
        thunk_format = self.__IMAGE_THUNK_DATA_format__
    elif self.PE_TYPE == OPTIONAL_HEADER_MAGIC_PE_PLUS:
        ordinal_flag = IMAGE_ORDINAL_FLAG64
        thunk_format = self.__IMAGE_THUNK_DATA64_format__
    else:
        # Some PEs may have an invalid value in the Magic field of the
        # Optional Header. Just in case the remaining file is parseable
        # let's pretend it's a 32bit PE32 by default.
        ordinal_flag = IMAGE_ORDINAL_FLAG
        thunk_format = self.__IMAGE_THUNK_DATA_format__

    expected_size = Structure(thunk_format).sizeof()
    MAX_ADDRESS_SPREAD = 128 * 2**20  # 128 MB
    ADDR_4GB = 2**32
    MAX_REPEATED_ADDRESSES = 15
    repeated_address = 0
    addresses_of_data_set_64 = AddressSet()
    addresses_of_data_set_32 = AddressSet()
    start_rva = rva

    while rva:
        if max_length is not None and rva >= start_rva + max_length:
            self.__warnings.append(
                "Error parsing the import table. Entries go beyond bounds."
            )
            break

        # Enforce an upper bounds on import symbols.
        if self.__total_import_symbols > MAX_IMPORT_SYMBOLS:
            self.__warnings.append(
                "Excessive number of imports %d (>%s)"
                % (self.__total_import_symbols, MAX_IMPORT_SYMBOLS)
            )
            break
        self.__total_import_symbols += 1

        # if we see too many times the same entry we assume it could be
        # a table containing bogus data (with malicious intent or otherwise)
        if repeated_address >= MAX_REPEATED_ADDRESSES:
            return []

        # if the addresses point somewhere but the difference between the highest
        # and lowest address is larger than MAX_ADDRESS_SPREAD we assume a bogus
        # table as the addresses should be contained within a module
        if addresses_of_data_set_32.diff() > MAX_ADDRESS_SPREAD:
            return []
        if addresses_of_data_set_64.diff() > MAX_ADDRESS_SPREAD:
            return []

        failed = False
        try:
            data = self.get_data(rva, expected_size)
        except PEFormatError:
            failed = True

        # `data` is only inspected when the read succeeded (short-circuit).
        if failed or len(data) != expected_size:
            self.__warnings.append(
                "Error parsing the import table. " "Invalid data at RVA: 0x%x" % rva
            )
            return None

        thunk_data = self.__unpack_data__(
            thunk_format, data, file_offset=self.get_offset_from_rva(rva)
        )

        # If the thunk data contains VAs instead of RVAs, we need to normalize them.
        # The checks further down treat a falsy `thunk_data` as possible, so it
        # must not be dereferenced here without the same guard.
        if contains_addresses and thunk_data:
            thunk_data.AddressOfData = self.normalize_import_va(
                thunk_data.AddressOfData
            )
            thunk_data.ForwarderString = self.normalize_import_va(
                thunk_data.ForwarderString
            )
            thunk_data.Function = self.normalize_import_va(thunk_data.Function)
            thunk_data.Ordinal = self.normalize_import_va(thunk_data.Ordinal)

        # Check if the AddressOfData lies within the range of RVAs that it's
        # being scanned, abort if that is the case, as it is very unlikely
        # to be legitimate data.
        # Seen in PE with SHA256:
        # 5945bb6f0ac879ddf61b1c284f3b8d20c06b228e75ae4f571fa87f5b9512902c
        if (
            thunk_data
            and thunk_data.AddressOfData >= start_rva
            and thunk_data.AddressOfData <= rva
        ):
            self.__warnings.append(
                "Error parsing the import table. "
                "AddressOfData overlaps with THUNK_DATA for "
                "THUNK at RVA 0x%x" % (rva)
            )
            break

        if thunk_data and thunk_data.AddressOfData:
            addr_of_data = thunk_data.AddressOfData
            # If the entry looks like could be an ordinal...
            if addr_of_data & ordinal_flag:
                # but its value is beyond 2^16, we will assume it's a
                # corrupted and ignore it altogether
                if addr_of_data & 0x7FFFFFFF > 0xFFFF:
                    return []
            # and if it looks like it should be an RVA
            else:
                # keep track of the RVAs seen and store them to study their
                # properties. When certain non-standard features are detected
                # the parsing will be aborted
                if addr_of_data >= ADDR_4GB:
                    the_set = addresses_of_data_set_64
                else:
                    the_set = addresses_of_data_set_32

                if addr_of_data in the_set:
                    repeated_address += 1
                the_set.add(addr_of_data)

        # An unparseable or all-zero thunk terminates the table.
        if not thunk_data or thunk_data.all_zeroes():
            break

        rva += thunk_data.sizeof()
        table.append(thunk_data)

    return table
def get_memory_mapped_image(self, max_virtual_address=0x10000000, ImageBase=None):
    """Build the byte image of the file laid out by section virtual address.

    The result starts with a copy of the file's data and then has every
    usable section placed at the offset given by its (alignment adjusted)
    VirtualAddress, so an offset in the returned data corresponds to the
    memory address ImageBase+offset.

    Sections whose adjusted VirtualAddress is at or beyond
    'max_virtual_address' are skipped, as are sections whose raw size or
    raw pointer fall outside the file. Packed executables commonly use such
    bogus values to confuse tools.

    When 'ImageBase' is supplied, 'relocate_image()' is called with it
    before the image is assembled and the previous data reference is put
    back afterwards.
    """
    rebase = ImageBase is not None

    if rebase:
        # Remember the current data so it can be reinstated after rebasing.
        saved_data = self.__data__
        self.relocate_image(ImageBase)

    # Start from the file contents and lay the sections on top of it.
    image = self.__data__[:]

    for sect in self.sections:
        # Sections with neither a virtual nor a raw size carry nothing.
        if sect.Misc_VirtualSize == 0 and sect.SizeOfRawData == 0:
            continue

        raw_size = sect.SizeOfRawData
        raw_ptr = self.adjust_FileAlignment(
            sect.PointerToRawData, self.OPTIONAL_HEADER.FileAlignment
        )
        virt_addr = self.adjust_SectionAlignment(
            sect.VirtualAddress,
            self.OPTIONAL_HEADER.SectionAlignment,
            self.OPTIONAL_HEADER.FileAlignment,
        )

        file_len = len(self.__data__)
        usable = (
            raw_size <= file_len
            and raw_ptr <= file_len
            and raw_size + raw_ptr <= file_len
            and virt_addr < max_virtual_address
        )
        if not usable:
            continue

        # Pad with zeroes up to the section start, or cut back whatever
        # was already placed beyond it.
        gap = virt_addr - len(image)
        if gap > 0:
            image += b"\0" * gap
        elif gap < 0:
            image = image[:gap]

        image += sect.get_data()

    if rebase:
        self.__data__ = saved_data

    return image
def get_resources_strings(self):
    """Return a list with every string found within the resources (if any).

    The resource directory is walked two levels deep (type entries, then
    id entries) and the values of each id directory's non-empty 'strings'
    mapping are gathered in order. An empty list is returned when the PE
    has no resource directory.
    """
    found = []

    if not hasattr(self, "DIRECTORY_ENTRY_RESOURCE"):
        return found

    for type_entry in self.DIRECTORY_ENTRY_RESOURCE.entries:
        if not hasattr(type_entry, "directory"):
            continue
        for id_entry in type_entry.directory.entries:
            if not hasattr(id_entry, "directory"):
                continue
            strings = getattr(id_entry.directory, "strings", None)
            if strings:
                found.extend(strings.values())

    return found
def get_data(self, rva=0, length=None):
    """Get data regardless of the section where it lies on.

    When a section contains 'rva' the request is handed to that section.
    Otherwise the data is sliced out of the header and, failing that, out
    of the raw file data, using the RVA directly as the index. A falsy
    'length' means "up to the end of whichever buffer is used".

    Raises PEFormatError when the RVA lies beyond all of those.
    """
    section = self.get_section_by_rva(rva)
    stop = rva + length if length else None

    if section:
        return section.get_data(rva, length)

    if rva < len(self.header):
        return self.header[rva:stop]

    # Before giving up, check whether the file itself holds the data.
    # There are PE files without sections that rely on Windows loading
    # the beginning of the file into memory and expect the data to be
    # found there.
    # A functional file with these characteristics is:
    # MD5: 0008892cdfbc3bda5ce047c565e52295
    # SHA-1: c7116b9ff950f86af256defb95b5d4859d4752a9
    #
    if rva < len(self.__data__):
        return self.__data__[rva:stop]

    raise PEFormatError("data at RVA can't be fetched. Corrupt header?")
def get_rva_from_offset(self, offset):
    """Get the RVA corresponding to this file offset.

    Returns the section-relative translation when a section contains the
    offset, the offset itself when it cannot belong to any section (no
    sections at all, or it lies before the lowest section RVA), and None
    otherwise.
    """
    section = self.get_section_by_offset(offset)
    if section:
        return section.get_rva_from_offset(offset)

    if not self.sections:
        return offset

    lowest_rva = min(
        self.adjust_SectionAlignment(
            sec.VirtualAddress,
            self.OPTIONAL_HEADER.SectionAlignment,
            self.OPTIONAL_HEADER.FileAlignment,
        )
        for sec in self.sections
    )

    # An offset before the earliest section is assumed to lie within the
    # headers, so the offset is returned unchanged as the RVA.
    #
    # The case illustrating this behavior can be found at:
    # http://corkami.blogspot.com/2010/01/hey-hey-hey-whats-in-your-head.html
    # where the import table is not contained by any section
    # hence the RVA needs to be resolved to a raw offset
    return offset if offset < lowest_rva else None
def get_offset_from_rva(self, rva):
    """Get the file offset corresponding to this RVA.

    The section containing the RVA performs the translation. When no
    section contains it, the RVA is returned unchanged as long as it is
    smaller than the size of the file data; otherwise PEFormatError is
    raised.
    """
    section = self.get_section_by_rva(rva)
    if section:
        return section.get_offset_from_rva(rva)

    # Not within any section: it might point to overlay data or to data
    # that is present but not covered by a section. In those cases the
    # RVA should equal the offset.
    if rva < len(self.__data__):
        return rva

    raise PEFormatError(f"data at RVA 0x{rva:x} can't be fetched")
def get_string_at_rva(self, rva, max_length=MAX_STRING_LENGTH):
    """Get an ASCII string located at the given address.

    At most 'max_length' bytes are read, either from the section that
    contains the RVA or, when there is none, straight from the file data
    using the RVA as the index. Returns None when 'rva' is None.
    """
    if rva is None:
        return None

    section = self.get_section_by_rva(rva)
    if section:
        chunk = section.get_data(rva, length=max_length)
    else:
        chunk = self.__data__[rva : rva + max_length]

    return self.get_string_from_data(0, chunk)
def get_bytes_from_data(self, offset, data):
    """Return the part of 'data' that starts at 'offset'.

    An offset past the end of the data yields b"". A bytearray slice is
    converted to bytes; any other slice is returned as is.
    """
    if offset > len(data):
        return b""

    tail = data[offset:]
    return bytes(tail) if isinstance(tail, bytearray) else tail
def get_string_from_data(self, offset, data):
    """Get an ASCII string from data.

    The bytes starting at 'offset' are returned up to, but not including,
    the first NUL byte, or in full when there is no NUL byte.
    """
    raw = self.get_bytes_from_data(offset, data)
    terminator = raw.find(b"\0")
    return raw if terminator < 0 else raw[:terminator]
def get_string_u_at_rva(self, rva, max_length=2**16, encoding=None):
    """Get an Unicode string located at the given address.

    'max_length' counts 16-bit characters. The data is read in up to two
    steps (first at most 256 bytes, then the remainder up to the limit),
    cut at the first 16-bit aligned double-NUL, and returned re-encoded
    with 'encoding' (UTF-8 when not given).
    """
    if max_length == 0:
        return b""

    # If the RVA is invalid let the exception reach the callers. All
    # call-sites of get_string_u_at_rva() will handle it.
    self.get_data(rva, 2)

    # The limit is expressed in 16-bit characters; double it to get bytes.
    byte_limit = max_length << 1
    wanted = min(byte_limit, 256)
    buf = self.get_data(rva, wanted)

    # Look for the terminator, fetching the rest of the data if needed.
    pos = -1
    while True:
        pos = buf.find(b"\x00\x00", pos + 1)
        if pos == -1:
            have = len(buf)
            if have < wanted or have == byte_limit:
                # Nothing more to read: use every complete character.
                char_count = have >> 1
                break
            # Request remaining part of data limited by max_length
            buf += self.get_data(rva + have, byte_limit - have)
            pos = wanted - 1
            wanted = byte_limit
        elif pos % 2 == 0:
            # Only a match on a character boundary is a terminator.
            char_count = pos >> 1
            break

    # Convert the selected part of the data to text.
    code_units = struct.unpack(f"<{char_count:d}H", buf[: char_count * 2])
    text = "".join(chr(unit) for unit in code_units)

    return b(text.encode(encoding or "utf-8", "backslashreplace_"))
def get_section_by_offset(self, offset):
    """Get the section containing the given file offset.

    Returns the first matching section, or None when no section contains
    the offset.
    """
    return next(
        (sec for sec in self.sections if sec.contains_offset(offset)),
        None,
    )
def get_section_by_rva(self, rva):
    """Get the section containing the given address.

    The section returned by the previous successful lookup is tried first,
    which speeds up runs of lookups that hit the same section (very useful
    when parsing the import table). Returns None when no section contains
    the RVA.
    """
    last_hit = self._get_section_by_rva_last_used
    if last_hit is not None and last_hit.contains_rva(rva):
        return last_hit

    for candidate in self.sections:
        if candidate.contains_rva(rva):
            # Remember the hit for the next lookup.
            self._get_section_by_rva_last_used = candidate
            return candidate

    return None
def __str__(self):
    """Return the text produced by dump_info() with its default arguments."""
    return self.dump_info()
def has_relocs(self):
    """Check whether the PE file has a relocation directory.

    Returns True when this object has a DIRECTORY_ENTRY_BASERELOC
    attribute, False otherwise.
    """
    return hasattr(self, "DIRECTORY_ENTRY_BASERELOC")
def has_dynamic_relocs(self):
    """Check whether the load config directory lists dynamic relocations.

    Returns True when DIRECTORY_ENTRY_LOAD_CONFIG is present and its
    'dynamic_relocations' attribute is truthy, False otherwise.
    """
    if not hasattr(self, "DIRECTORY_ENTRY_LOAD_CONFIG"):
        return False
    return bool(self.DIRECTORY_ENTRY_LOAD_CONFIG.dynamic_relocations)
def print_info(self, encoding="utf-8"):
    """Print all the PE header information in a human readable form.

    The text comes from dump_info(), which receives 'encoding' unchanged.
    Note that the default here is "utf-8" while dump_info() defaults to
    "ascii".
    """
    print(self.dump_info(encoding=encoding))
def dump_info(self, dump=None, encoding="ascii"):
    """Dump all the PE header information into human readable string.

    Args:
        dump: optional Dump instance to write into; a new one is created
            when omitted.
        encoding: codec used to turn byte strings taken from the file
            (DLL names, symbol names, version strings, ...) into text.
            Bytes that cannot be decoded are rendered as backslash
            escapes instead of raising.

    Returns:
        The accumulated text, as returned by the Dump object's get_text().
    """
    if dump is None:
        dump = Dump()

    warnings = self.get_warnings()
    if warnings:
        dump.add_header("Parsing Warnings")
        for warning in warnings:
            dump.add_line(warning)
            dump.add_newline()

    dump.add_header("DOS_HEADER")
    dump.add_lines(self.DOS_HEADER.dump())
    dump.add_newline()

    dump.add_header("NT_HEADERS")
    dump.add_lines(self.NT_HEADERS.dump())
    dump.add_newline()

    dump.add_header("FILE_HEADER")
    dump.add_lines(self.FILE_HEADER.dump())

    image_flags = retrieve_flags(IMAGE_CHARACTERISTICS, "IMAGE_FILE_")

    dump.add("Flags: ")
    flags = [
        flag[0] for flag in sorted(image_flags) if getattr(self.FILE_HEADER, flag[0])
    ]
    dump.add_line(", ".join(flags))
    dump.add_newline()

    if hasattr(self, "OPTIONAL_HEADER") and self.OPTIONAL_HEADER is not None:
        dump.add_header("OPTIONAL_HEADER")
        dump.add_lines(self.OPTIONAL_HEADER.dump())

        # The flags are read from the optional header, so they are only
        # reported when there is one.
        dll_characteristics_flags = retrieve_flags(
            DLL_CHARACTERISTICS, "IMAGE_DLLCHARACTERISTICS_"
        )

        dump.add("DllCharacteristics: ")
        flags = [
            flag[0]
            for flag in sorted(dll_characteristics_flags)
            if getattr(self.OPTIONAL_HEADER, flag[0])
        ]
        dump.add_line(", ".join(flags))
        dump.add_newline()

    dump.add_header("PE Sections")

    section_flags = retrieve_flags(SECTION_CHARACTERISTICS, "IMAGE_SCN_")

    for section in self.sections:
        dump.add_lines(section.dump())
        dump.add("Flags: ")
        flags = [
            flag[0] for flag in sorted(section_flags) if getattr(section, flag[0])
        ]
        dump.add_line(", ".join(flags))
        dump.add_line(
            "Entropy: {0:f} (Min=0.0, Max=8.0)".format(section.get_entropy())
        )
        if md5 is not None:
            dump.add_line("MD5 hash: {0}".format(section.get_hash_md5()))
        if sha1 is not None:
            dump.add_line("SHA-1 hash: %s" % section.get_hash_sha1())
        if sha256 is not None:
            dump.add_line("SHA-256 hash: %s" % section.get_hash_sha256())
        if sha512 is not None:
            dump.add_line("SHA-512 hash: %s" % section.get_hash_sha512())
        dump.add_newline()

    if hasattr(self, "OPTIONAL_HEADER") and hasattr(
        self.OPTIONAL_HEADER, "DATA_DIRECTORY"
    ):
        dump.add_header("Directories")
        for directory in self.OPTIONAL_HEADER.DATA_DIRECTORY:
            if directory is not None:
                dump.add_lines(directory.dump())
        dump.add_newline()

    if hasattr(self, "VS_VERSIONINFO"):
        for idx, vinfo_entry in enumerate(self.VS_VERSIONINFO):
            if len(self.VS_VERSIONINFO) > 1:
                dump.add_header(f"Version Information {idx + 1}")
            else:
                dump.add_header("Version Information")
            if vinfo_entry is not None:
                dump.add_lines(vinfo_entry.dump())
            dump.add_newline()

            # Guard the index the same way as for FileInfo below: the two
            # lists are not guaranteed here to have the same length.
            if hasattr(self, "VS_FIXEDFILEINFO") and len(self.VS_FIXEDFILEINFO) > idx:
                dump.add_lines(self.VS_FIXEDFILEINFO[idx].dump())
                dump.add_newline()

            if hasattr(self, "FileInfo") and len(self.FileInfo) > idx:
                for entry in self.FileInfo[idx]:
                    dump.add_lines(entry.dump())
                    dump.add_newline()

                    if hasattr(entry, "StringTable"):
                        for st_entry in entry.StringTable:
                            for line in st_entry.dump():
                                dump.add_line(" " + line)
                            dump.add_line(
                                " LangID: {0}".format(
                                    st_entry.LangID.decode(
                                        encoding, "backslashreplace_"
                                    )
                                )
                            )
                            dump.add_newline()
                            for str_entry in sorted(st_entry.entries.items()):
                                dump.add_line(
                                    " {0}: {1}".format(
                                        str_entry[0].decode(
                                            encoding, "backslashreplace_"
                                        ),
                                        str_entry[1].decode(
                                            encoding, "backslashreplace_"
                                        ),
                                    )
                                )
                        dump.add_newline()

                    elif hasattr(entry, "Var"):
                        for var_entry in entry.Var:
                            if hasattr(var_entry, "entry"):
                                for line in var_entry.dump():
                                    dump.add_line(" " + line)
                                dump.add_line(
                                    " {0}: {1}".format(
                                        list(var_entry.entry.keys())[0].decode(
                                            "utf-8", "backslashreplace_"
                                        ),
                                        list(var_entry.entry.values())[0],
                                    )
                                )
                        dump.add_newline()

    if hasattr(self, "DIRECTORY_ENTRY_EXPORT"):
        dump.add_header("Exported symbols")
        dump.add_lines(self.DIRECTORY_ENTRY_EXPORT.struct.dump())
        dump.add_newline()
        dump.add_line("%-10s %-10s %s" % ("Ordinal", "RVA", "Name"))
        for export in self.DIRECTORY_ENTRY_EXPORT.symbols:
            if export.address is not None:
                name = b("None")
                if export.name:
                    name = export.name
                # Export names come straight from the file and need not be
                # valid in `encoding`; escape what cannot be decoded.
                dump.add(
                    "%-10d 0x%08X %s"
                    % (
                        export.ordinal,
                        export.address,
                        name.decode(encoding, "backslashreplace_"),
                    )
                )
                if export.forwarder:
                    dump.add_line(
                        " forwarder: {0}".format(
                            export.forwarder.decode(encoding, "backslashreplace_")
                        )
                    )
                else:
                    dump.add_newline()
        dump.add_newline()

    if hasattr(self, "DIRECTORY_ENTRY_IMPORT"):
        dump.add_header("Imported symbols")
        for module in self.DIRECTORY_ENTRY_IMPORT:
            dump.add_lines(module.struct.dump())
            # Print the name of the DLL if there are no imports.
            if not module.imports:
                dump.add(
                    " Name -> {0}".format(
                        self.get_string_at_rva(module.struct.Name).decode(
                            encoding, "backslashreplace_"
                        )
                    )
                )
                dump.add_newline()
            dump.add_newline()
            for symbol in module.imports:
                if symbol.import_by_ordinal is True:
                    # Decoded with the caller's encoding and the lenient
                    # error handler, like every other name in this dump.
                    if symbol.name is not None:
                        dump.add(
                            "{0}.{1} Ordinal[{2}] (Imported by Ordinal)".format(
                                module.dll.decode(encoding, "backslashreplace_"),
                                symbol.name.decode(encoding, "backslashreplace_"),
                                symbol.ordinal,
                            )
                        )
                    else:
                        dump.add(
                            "{0} Ordinal[{1}] (Imported by Ordinal)".format(
                                module.dll.decode(encoding, "backslashreplace_"),
                                symbol.ordinal,
                            )
                        )
                else:
                    dump.add(
                        "{0}.{1} Hint[{2:d}]".format(
                            module.dll.decode(encoding, "backslashreplace_"),
                            symbol.name.decode(encoding, "backslashreplace_"),
                            symbol.hint,
                        )
                    )

                if symbol.bound:
                    dump.add_line(" Bound: 0x{0:08X}".format(symbol.bound))
                else:
                    dump.add_newline()
            dump.add_newline()

    if hasattr(self, "DIRECTORY_ENTRY_BOUND_IMPORT"):
        dump.add_header("Bound imports")
        for bound_imp_desc in self.DIRECTORY_ENTRY_BOUND_IMPORT:
            dump.add_lines(bound_imp_desc.struct.dump())
            dump.add_line(
                "DLL: {0}".format(
                    bound_imp_desc.name.decode(encoding, "backslashreplace_")
                )
            )
            dump.add_newline()

            for bound_imp_ref in bound_imp_desc.entries:
                dump.add_lines(bound_imp_ref.struct.dump(), 4)
                dump.add_line(
                    "DLL: {0}".format(
                        bound_imp_ref.name.decode(encoding, "backslashreplace_")
                    ),
                    4,
                )
                dump.add_newline()

    if hasattr(self, "DIRECTORY_ENTRY_DELAY_IMPORT"):
        dump.add_header("Delay Imported symbols")
        for module in self.DIRECTORY_ENTRY_DELAY_IMPORT:
            dump.add_lines(module.struct.dump())
            dump.add_newline()

            for symbol in module.imports:
                if symbol.import_by_ordinal is True:
                    dump.add(
                        "{0} Ordinal[{1:d}] (Imported by Ordinal)".format(
                            module.dll.decode(encoding, "backslashreplace_"),
                            symbol.ordinal,
                        )
                    )
                else:
                    dump.add(
                        "{0}.{1} Hint[{2}]".format(
                            module.dll.decode(encoding, "backslashreplace_"),
                            symbol.name.decode(encoding, "backslashreplace_"),
                            symbol.hint,
                        )
                    )

                if symbol.bound:
                    dump.add_line(" Bound: 0x{0:08X}".format(symbol.bound))
                else:
                    dump.add_newline()
            dump.add_newline()

    if hasattr(self, "DIRECTORY_ENTRY_RESOURCE"):
        dump.add_header("Resource directory")
        dump.add_lines(self.DIRECTORY_ENTRY_RESOURCE.struct.dump())

        for res_type in self.DIRECTORY_ENTRY_RESOURCE.entries:
            if res_type.name is not None:
                name = res_type.name.decode(encoding, "backslashreplace_")
                dump.add_line(
                    f"Name: [{name}]",
                    2,
                )
            else:
                res_type_id = RESOURCE_TYPE.get(res_type.struct.Id, "-")
                dump.add_line(
                    f"Id: [0x{res_type.struct.Id:X}] ({res_type_id})",
                    2,
                )

            dump.add_lines(res_type.struct.dump(), 2)

            if hasattr(res_type, "directory"):
                dump.add_lines(res_type.directory.struct.dump(), 4)

                for resource_id in res_type.directory.entries:
                    if resource_id.name is not None:
                        name = resource_id.name.decode("utf-8", "backslashreplace_")
                        dump.add_line(
                            f"Name: [{name}]",
                            6,
                        )
                    else:
                        dump.add_line(f"Id: [0x{resource_id.struct.Id:X}]", 6)

                    dump.add_lines(resource_id.struct.dump(), 6)

                    if hasattr(resource_id, "directory"):
                        dump.add_lines(resource_id.directory.struct.dump(), 8)

                        for resource_lang in resource_id.directory.entries:
                            if hasattr(resource_lang, "data"):
                                dump.add_line(
                                    "\\--- LANG [%d,%d][%s,%s]"
                                    % (
                                        resource_lang.data.lang,
                                        resource_lang.data.sublang,
                                        LANG.get(
                                            resource_lang.data.lang, "*unknown*"
                                        ),
                                        get_sublang_name_for_lang(
                                            resource_lang.data.lang,
                                            resource_lang.data.sublang,
                                        ),
                                    ),
                                    8,
                                )
                                dump.add_lines(resource_lang.struct.dump(), 10)
                                dump.add_lines(resource_lang.data.struct.dump(), 12)

                        if (
                            hasattr(resource_id.directory, "strings")
                            and resource_id.directory.strings
                        ):
                            dump.add_line("[STRINGS]", 10)
                            for string_id, res_string in sorted(
                                resource_id.directory.strings.items()
                            ):
                                dump.add_line(
                                    "{0:6d}: {1}".format(
                                        string_id,
                                        res_string.encode(
                                            "unicode-escape", "backslashreplace"
                                        ).decode("ascii"),
                                    ),
                                    12,
                                )

            dump.add_newline()

        dump.add_newline()

    if (
        hasattr(self, "DIRECTORY_ENTRY_TLS")
        and self.DIRECTORY_ENTRY_TLS
        and self.DIRECTORY_ENTRY_TLS.struct
    ):
        dump.add_header("TLS")
        dump.add_lines(self.DIRECTORY_ENTRY_TLS.struct.dump())
        dump.add_newline()

    if (
        hasattr(self, "DIRECTORY_ENTRY_LOAD_CONFIG")
        and self.DIRECTORY_ENTRY_LOAD_CONFIG
        and self.DIRECTORY_ENTRY_LOAD_CONFIG.struct
    ):
        dump.add_header("LOAD_CONFIG")
        dump.add_lines(self.DIRECTORY_ENTRY_LOAD_CONFIG.struct.dump())
        dump.add_newline()

    if hasattr(self, "DIRECTORY_ENTRY_DEBUG"):
        dump.add_header("Debug information")
        for dbg in self.DIRECTORY_ENTRY_DEBUG:
            dump.add_lines(dbg.struct.dump())
            try:
                dump.add_line("Type: " + DEBUG_TYPE[dbg.struct.Type])
            except KeyError:
                dump.add_line("Type: 0x{0:x}(Unknown)".format(dbg.struct.Type))
            dump.add_newline()
            if dbg.entry:
                dump.add_lines(dbg.entry.dump(), 4)
                dump.add_newline()

    if self.has_relocs():
        dump.add_header("Base relocations")
        for base_reloc in self.DIRECTORY_ENTRY_BASERELOC:
            dump.add_lines(base_reloc.struct.dump())
            for reloc in base_reloc.entries:
                try:
                    # [16:] drops the leading part of the relocation type
                    # name before it is printed.
                    dump.add_line(
                        "%08Xh %s" % (reloc.rva, RELOCATION_TYPE[reloc.type][16:]),
                        4,
                    )
                except KeyError:
                    dump.add_line(
                        "0x%08X 0x%x(Unknown)" % (reloc.rva, reloc.type), 4
                    )
            dump.add_newline()

    if (
        hasattr(self, "DIRECTORY_ENTRY_EXCEPTION")
        and len(self.DIRECTORY_ENTRY_EXCEPTION) > 0
    ):
        dump.add_header("Unwind data for exception handling")
        for rf in self.DIRECTORY_ENTRY_EXCEPTION:
            dump.add_lines(rf.struct.dump())
            if hasattr(rf, "unwindinfo") and rf.unwindinfo is not None:
                dump.add_lines(rf.unwindinfo.dump(), 4)

    return dump.get_text()
def dump_dict(self):
    """Dump all the PE header information into a dictionary.

    Returns:
        A dict keyed by section title ("DOS_HEADER", "PE Sections",
        "Imported symbols", ...). Only the parts present in this PE are
        included. Byte strings taken from the file (names, DLLs, ...) are
        stored undecoded.
    """
    info = {}

    warnings = self.get_warnings()
    if warnings:
        info["Parsing Warnings"] = warnings

    info["DOS_HEADER"] = self.DOS_HEADER.dump_dict()
    info["NT_HEADERS"] = self.NT_HEADERS.dump_dict()
    info["FILE_HEADER"] = self.FILE_HEADER.dump_dict()

    image_flags = retrieve_flags(IMAGE_CHARACTERISTICS, "IMAGE_FILE_")

    info["Flags"] = [
        flag[0] for flag in image_flags if getattr(self.FILE_HEADER, flag[0])
    ]

    if hasattr(self, "OPTIONAL_HEADER") and self.OPTIONAL_HEADER is not None:
        info["OPTIONAL_HEADER"] = self.OPTIONAL_HEADER.dump_dict()

        # The flags are read from the optional header, so they are only
        # reported when there is one.
        dll_characteristics_flags = retrieve_flags(
            DLL_CHARACTERISTICS, "IMAGE_DLLCHARACTERISTICS_"
        )

        info["DllCharacteristics"] = [
            flag[0]
            for flag in dll_characteristics_flags
            if getattr(self.OPTIONAL_HEADER, flag[0])
        ]

    info["PE Sections"] = []

    section_flags = retrieve_flags(SECTION_CHARACTERISTICS, "IMAGE_SCN_")
    for section in self.sections:
        section_dict = section.dump_dict()
        info["PE Sections"].append(section_dict)
        section_dict["Flags"] = [
            flag[0] for flag in section_flags if getattr(section, flag[0])
        ]

        section_dict["Entropy"] = section.get_entropy()
        if md5 is not None:
            section_dict["MD5"] = section.get_hash_md5()
        if sha1 is not None:
            section_dict["SHA1"] = section.get_hash_sha1()
        if sha256 is not None:
            section_dict["SHA256"] = section.get_hash_sha256()
        if sha512 is not None:
            section_dict["SHA512"] = section.get_hash_sha512()

    if hasattr(self, "OPTIONAL_HEADER") and hasattr(
        self.OPTIONAL_HEADER, "DATA_DIRECTORY"
    ):
        info["Directories"] = []
        for directory in self.OPTIONAL_HEADER.DATA_DIRECTORY:
            if directory is not None:
                info["Directories"].append(directory.dump_dict())

    if hasattr(self, "VS_VERSIONINFO"):
        info["Version Information"] = []
        for idx, vs_vinfo in enumerate(self.VS_VERSIONINFO):
            version_info_list = []
            version_info_list.append(vs_vinfo.dump_dict())

            # Guard the index the same way as for FileInfo below: the two
            # lists are not guaranteed here to have the same length.
            if hasattr(self, "VS_FIXEDFILEINFO") and len(self.VS_FIXEDFILEINFO) > idx:
                version_info_list.append(self.VS_FIXEDFILEINFO[idx].dump_dict())

            if hasattr(self, "FileInfo") and len(self.FileInfo) > idx:
                fileinfo_list = []
                version_info_list.append(fileinfo_list)
                for entry in self.FileInfo[idx]:
                    fileinfo_list.append(entry.dump_dict())

                    if hasattr(entry, "StringTable"):
                        stringtable_dict = {}
                        for st_entry in entry.StringTable:
                            # NOTE(review): extend() with the result of
                            # dump_dict() is kept as found; if that result
                            # is a dict only its keys get added. Confirm
                            # this is the intended output.
                            fileinfo_list.extend(st_entry.dump_dict())
                            stringtable_dict["LangID"] = st_entry.LangID
                            for key, value in st_entry.entries.items():
                                stringtable_dict[key] = value
                        fileinfo_list.append(stringtable_dict)

                    elif hasattr(entry, "Var"):
                        for var_entry in entry.Var:
                            var_dict = {}
                            if hasattr(var_entry, "entry"):
                                fileinfo_list.extend(var_entry.dump_dict())
                                var_dict[list(var_entry.entry.keys())[0]] = list(
                                    var_entry.entry.values()
                                )[0]
                                fileinfo_list.append(var_dict)

            info["Version Information"].append(version_info_list)

    if hasattr(self, "DIRECTORY_ENTRY_EXPORT"):
        info["Exported symbols"] = []
        info["Exported symbols"].append(
            self.DIRECTORY_ENTRY_EXPORT.struct.dump_dict()
        )
        for export in self.DIRECTORY_ENTRY_EXPORT.symbols:
            export_dict = {}
            if export.address is not None:
                export_dict.update(
                    {
                        "Ordinal": export.ordinal,
                        "RVA": export.address,
                        "Name": export.name,
                    }
                )
                if export.forwarder:
                    export_dict["forwarder"] = export.forwarder
            info["Exported symbols"].append(export_dict)

    if hasattr(self, "DIRECTORY_ENTRY_IMPORT"):
        info["Imported symbols"] = []
        for module in self.DIRECTORY_ENTRY_IMPORT:
            import_list = []
            info["Imported symbols"].append(import_list)
            import_list.append(module.struct.dump_dict())
            for symbol in module.imports:
                symbol_dict = {}
                if symbol.import_by_ordinal is True:
                    symbol_dict["DLL"] = module.dll
                    symbol_dict["Ordinal"] = symbol.ordinal
                else:
                    symbol_dict["DLL"] = module.dll
                    symbol_dict["Name"] = symbol.name
                    symbol_dict["Hint"] = symbol.hint

                if symbol.bound:
                    symbol_dict["Bound"] = symbol.bound
                import_list.append(symbol_dict)

    if hasattr(self, "DIRECTORY_ENTRY_BOUND_IMPORT"):
        info["Bound imports"] = []
        for bound_imp_desc in self.DIRECTORY_ENTRY_BOUND_IMPORT:
            bound_imp_desc_dict = {}
            info["Bound imports"].append(bound_imp_desc_dict)

            bound_imp_desc_dict.update(bound_imp_desc.struct.dump_dict())
            bound_imp_desc_dict["DLL"] = bound_imp_desc.name

            for bound_imp_ref in bound_imp_desc.entries:
                bound_imp_ref_dict = {}
                bound_imp_ref_dict.update(bound_imp_ref.struct.dump_dict())
                bound_imp_ref_dict["DLL"] = bound_imp_ref.name
                # Attach the reference to its descriptor; it used to be
                # built and then thrown away. The key is only present for
                # descriptors that actually have references.
                bound_imp_desc_dict.setdefault("Entries", []).append(
                    bound_imp_ref_dict
                )

    if hasattr(self, "DIRECTORY_ENTRY_DELAY_IMPORT"):
        info["Delay Imported symbols"] = []
        for module in self.DIRECTORY_ENTRY_DELAY_IMPORT:
            module_list = []
            info["Delay Imported symbols"].append(module_list)
            module_list.append(module.struct.dump_dict())

            for symbol in module.imports:
                symbol_dict = {}
                if symbol.import_by_ordinal is True:
                    symbol_dict["DLL"] = module.dll
                    symbol_dict["Ordinal"] = symbol.ordinal
                else:
                    symbol_dict["DLL"] = module.dll
                    symbol_dict["Name"] = symbol.name
                    symbol_dict["Hint"] = symbol.hint

                if symbol.bound:
                    symbol_dict["Bound"] = symbol.bound
                module_list.append(symbol_dict)

    if hasattr(self, "DIRECTORY_ENTRY_RESOURCE"):
        info["Resource directory"] = []
        info["Resource directory"].append(
            self.DIRECTORY_ENTRY_RESOURCE.struct.dump_dict()
        )

        for res_type in self.DIRECTORY_ENTRY_RESOURCE.entries:
            resource_type_dict = {}

            if res_type.name is not None:
                resource_type_dict["Name"] = res_type.name
            else:
                resource_type_dict["Id"] = (
                    res_type.struct.Id,
                    RESOURCE_TYPE.get(res_type.struct.Id, "-"),
                )

            resource_type_dict.update(res_type.struct.dump_dict())
            info["Resource directory"].append(resource_type_dict)

            if hasattr(res_type, "directory"):
                directory_list = []
                directory_list.append(res_type.directory.struct.dump_dict())
                info["Resource directory"].append(directory_list)

                for resource_id in res_type.directory.entries:
                    resource_id_dict = {}

                    if resource_id.name is not None:
                        resource_id_dict["Name"] = resource_id.name
                    else:
                        resource_id_dict["Id"] = resource_id.struct.Id

                    resource_id_dict.update(resource_id.struct.dump_dict())
                    directory_list.append(resource_id_dict)

                    if hasattr(resource_id, "directory"):
                        resource_id_list = []
                        resource_id_list.append(
                            resource_id.directory.struct.dump_dict()
                        )
                        directory_list.append(resource_id_list)

                        for resource_lang in resource_id.directory.entries:
                            if hasattr(resource_lang, "data"):
                                resource_lang_dict = {}
                                resource_lang_dict["LANG"] = resource_lang.data.lang
                                resource_lang_dict[
                                    "SUBLANG"
                                ] = resource_lang.data.sublang
                                resource_lang_dict["LANG_NAME"] = LANG.get(
                                    resource_lang.data.lang, "*unknown*"
                                )
                                resource_lang_dict[
                                    "SUBLANG_NAME"
                                ] = get_sublang_name_for_lang(
                                    resource_lang.data.lang,
                                    resource_lang.data.sublang,
                                )
                                resource_lang_dict.update(
                                    resource_lang.struct.dump_dict()
                                )
                                resource_lang_dict.update(
                                    resource_lang.data.struct.dump_dict()
                                )
                                resource_id_list.append(resource_lang_dict)

                        if (
                            hasattr(resource_id.directory, "strings")
                            and resource_id.directory.strings
                        ):
                            for res_string in resource_id.directory.strings.values():
                                resource_id_list.append(
                                    res_string.encode(
                                        "unicode-escape", "backslashreplace"
                                    ).decode("ascii")
                                )

    if (
        hasattr(self, "DIRECTORY_ENTRY_TLS")
        and self.DIRECTORY_ENTRY_TLS
        and self.DIRECTORY_ENTRY_TLS.struct
    ):
        info["TLS"] = self.DIRECTORY_ENTRY_TLS.struct.dump_dict()

    if (
        hasattr(self, "DIRECTORY_ENTRY_LOAD_CONFIG")
        and self.DIRECTORY_ENTRY_LOAD_CONFIG
        and self.DIRECTORY_ENTRY_LOAD_CONFIG.struct
    ):
        info["LOAD_CONFIG"] = self.DIRECTORY_ENTRY_LOAD_CONFIG.struct.dump_dict()

    if hasattr(self, "DIRECTORY_ENTRY_DEBUG"):
        info["Debug information"] = []
        for dbg in self.DIRECTORY_ENTRY_DEBUG:
            dbg_dict = {}
            info["Debug information"].append(dbg_dict)
            dbg_dict.update(dbg.struct.dump_dict())
            # Unknown debug types are reported by their numeric value.
            dbg_dict["Type"] = DEBUG_TYPE.get(dbg.struct.Type, dbg.struct.Type)

    if self.has_relocs():
        info["Base relocations"] = []
        for base_reloc in self.DIRECTORY_ENTRY_BASERELOC:
            base_reloc_list = []
            info["Base relocations"].append(base_reloc_list)
            base_reloc_list.append(base_reloc.struct.dump_dict())
            for reloc in base_reloc.entries:
                reloc_dict = {}
                base_reloc_list.append(reloc_dict)
                reloc_dict["RVA"] = reloc.rva
                try:
                    # [16:] drops the leading part of the relocation type
                    # name before it is stored.
                    reloc_dict["Type"] = RELOCATION_TYPE[reloc.type][16:]
                except KeyError:
                    reloc_dict["Type"] = reloc.type

    return info
# OC Patch
def get_physical_by_rva(self, rva):
    """Gets the physical address in the PE file from an RVA value.

    This is a non-raising wrapper around get_offset_from_rva(): the file
    offset is returned on success and None when that call raises any
    Exception.
    """
    try:
        return self.get_offset_from_rva(rva)
    except Exception:
        # Best effort on purpose: every failure is reported as None.
        return None
##
# Double-Word get / set
##
def get_data_from_dword(self, dword):
    """Return a four byte string representing the double word value (little endian).

    Only the low 32 bits of 'dword' are used, so negative and oversized
    values wrap around instead of failing.
    """
    low_32_bits = dword & 0xFFFFFFFF
    return low_32_bits.to_bytes(4, "little")
def get_dword_from_data(self, data, offset):
    """Convert four bytes of data to a double word (little endian)

    'offset' indexes into 'data' as if it were an array of dwords, so a
    value of N reads the four bytes starting at byte N*4.

    Returns None if the data is too short to hold that dword.
    """
    first = offset * 4
    past_end = first + 4

    if past_end > len(data):
        return None

    (value,) = struct.unpack("<I", data[first:past_end])
    return value
def get_dword_at_rva(self, rva):
    """Return the double word value at the given RVA.

    Four bytes are requested from get_data() and decoded as a little
    endian dword.

    Returns None if the value can't be read, i.e. the RVA can't be mapped
    to a file offset.
    """
    try:
        raw = self.get_data(rva, 4)
        return self.get_dword_from_data(raw, 0)
    except PEFormatError:
        return None
def get_dword_from_offset(self, offset):
    """Return the double word value at the given file offset. (little endian)

    Returns None when fewer than four bytes are available at 'offset'.
    """
    past_end = offset + 4
    if past_end > len(self.__data__):
        return None

    four_bytes = self.__data__[offset:past_end]
    return self.get_dword_from_data(four_bytes, 0)
def set_dword_at_rva(self, rva, dword):
    """Set the double word value at the file offset corresponding to the given RVA.

    The value is packed with get_data_from_dword() and written through
    set_bytes_at_rva(), whose result is returned.
    """
    packed = self.get_data_from_dword(dword)
    return self.set_bytes_at_rva(rva, packed)
def set_dword_at_offset(self, offset, dword):
    """Set the double word value at the given file offset.

    The value is packed with get_data_from_dword() and written through
    set_bytes_at_offset(), whose result is returned.
    """
    packed = self.get_data_from_dword(dword)
    return self.set_bytes_at_offset(offset, packed)
##
# Word get / set
##
def get_data_from_word(self, word):
    """Return a two byte string representing the word value. (little endian).

    Only the low 16 bits of 'word' are used, mirroring the masking done by
    get_data_from_dword(), so negative and oversized values wrap around
    instead of raising struct.error.
    """
    return struct.pack("<H", word & 0xFFFF)
def get_word_from_data(self, data, offset):
    """Read the little-endian word stored at index 'offset' of 'data'.

    'data' is treated as an array of words, so index N refers to the two
    bytes starting at byte N*2.

    Returns None when 'data' is too short to contain that word.
    """
    first_byte = offset * 2
    past_last_byte = (offset + 1) * 2
    if past_last_byte > len(data):
        return None
    (value,) = struct.unpack("<H", data[first_byte:past_last_byte])
    return value
def get_word_at_rva(self, rva):
    """Return the word value at the given RVA.

    Only the two bytes that are needed are requested from get_data(), in the
    same way get_dword_at_rva() requests four, instead of fetching everything
    that follows the RVA and slicing it afterwards.

    Returns None if the value can't be read, i.e. the RVA can't be mapped
    to a file offset (get_data() raises PEFormatError) or fewer than two
    bytes are available.
    """
    try:
        return self.get_word_from_data(self.get_data(rva, 2), 0)
    except PEFormatError:
        return None
def get_word_from_offset(self, offset):
    """Read the little-endian word stored at the given file offset.

    Returns None when the two bytes would extend past the end of the file
    data.
    """
    end = offset + 2
    if end > len(self.__data__):
        return None
    raw_bytes = self.__data__[offset:end]
    return self.get_word_from_data(raw_bytes, 0)
def set_word_at_rva(self, rva, word):
    """Write a word at the file offset that the given RVA maps to.

    Returns whatever set_bytes_at_rva() reports for the packed value.
    """
    packed_word = self.get_data_from_word(word)
    return self.set_bytes_at_rva(rva, packed_word)
def set_word_at_offset(self, offset, word):
    """Write a word at the given file offset.

    Returns whatever set_bytes_at_offset() reports for the packed value.
    """
    packed_word = self.get_data_from_word(word)
    return self.set_bytes_at_offset(offset, packed_word)
##
# Quad-Word get / set
##
def get_data_from_qword(self, word):
    """Pack a quad-word into its eight byte little-endian representation.

    The value is handed to struct.pack() unmodified, so it has to fit in an
    unsigned 64-bit integer.
    """
    packed_qword = struct.pack("<Q", word)
    return packed_qword
def get_qword_from_data(self, data, offset):
    """Read the little-endian quad-word stored at index 'offset' of 'data'.

    'data' is treated as an array of quad-words, so index N refers to the
    eight bytes starting at byte N*8.

    Returns None when 'data' is too short to contain that quad-word.
    """
    first_byte = offset * 8
    past_last_byte = (offset + 1) * 8
    if past_last_byte > len(data):
        return None
    (value,) = struct.unpack("<Q", data[first_byte:past_last_byte])
    return value
def get_qword_at_rva(self, rva):
    """Return the quad-word value at the given RVA.

    Only the eight bytes that are needed are requested from get_data(), in
    the same way get_dword_at_rva() requests four, instead of fetching
    everything that follows the RVA and slicing it afterwards.

    Returns None if the value can't be read, i.e. the RVA can't be mapped
    to a file offset (get_data() raises PEFormatError) or fewer than eight
    bytes are available.
    """
    try:
        return self.get_qword_from_data(self.get_data(rva, 8), 0)
    except PEFormatError:
        return None
def get_qword_from_offset(self, offset):
    """Read the little-endian quad-word stored at the given file offset.

    Returns None when the eight bytes would extend past the end of the file
    data.
    """
    end = offset + 8
    if end > len(self.__data__):
        return None
    raw_bytes = self.__data__[offset:end]
    return self.get_qword_from_data(raw_bytes, 0)
def set_qword_at_rva(self, rva, qword):
    """Write a quad-word at the file offset that the given RVA maps to.

    Returns whatever set_bytes_at_rva() reports for the packed value.
    """
    packed_qword = self.get_data_from_qword(qword)
    return self.set_bytes_at_rva(rva, packed_qword)
def set_qword_at_offset(self, offset, qword):
    """Write a quad-word at the given file offset.

    Returns whatever set_bytes_at_offset() reports for the packed value.
    """
    packed_qword = self.get_data_from_qword(qword)
    return self.set_bytes_at_offset(offset, packed_qword)
##
# Set bytes
##
def set_bytes_at_rva(self, rva, data):
    """Overwrite, with the given bytes, the data at the file offset corresponding
    to the given RVA.

    Return True if successful, False otherwise. It can fail if the RVA can't
    be mapped to a file offset or if the offset is outside the file's
    boundaries.

    Raises TypeError if 'data' is not a bytes object.
    """
    if not isinstance(data, bytes):
        raise TypeError("data should be of type: bytes")

    offset = self.get_physical_by_rva(rva)
    # get_physical_by_rva() signals failure with None. File offset 0 is a
    # perfectly valid target, so it must not be mistaken for a failure.
    if offset is None:
        return False

    return self.set_bytes_at_offset(offset, data)
def set_bytes_at_offset(self, offset, data):
    """Overwrite the file data starting at 'offset' with the given bytes.

    Returns True when the bytes were written and False when 'offset' does not
    fall inside the current file data.

    Raises TypeError if 'data' is not a bytes object.
    """
    if not isinstance(data, bytes):
        raise TypeError("data should be of type: bytes")

    offset_in_file = 0 <= offset < len(self.__data__)
    if not offset_in_file:
        return False

    self.set_data_bytes(offset, data)
    return True
def set_data_bytes(self, offset: int, data: bytes):
    """Store 'data' into the file data starting at 'offset'.

    The file data is converted to a bytearray on first use so that it can be
    modified in place; no bounds checking is done here.
    """
    buffer = self.__data__
    if not isinstance(buffer, bytearray):
        buffer = bytearray(buffer)
        self.__data__ = buffer
    end = offset + len(data)
    buffer[offset:end] = data
def merge_modified_section_data(self):
    """Update the PE image content with any individual section data that has been
    modified.

    Every section whose raw data range lies completely inside the file data
    is copied back over the file data at the section's (alignment adjusted)
    PointerToRawData. Sections reaching beyond the end of the file are left
    untouched.
    """
    file_size = len(self.__data__)
    for section in self.sections:
        section_data_start = self.adjust_FileAlignment(
            section.PointerToRawData, self.OPTIONAL_HEADER.FileAlignment
        )
        section_data_end = section_data_start + section.SizeOfRawData
        # 'section_data_end' is exclusive, so a section that finishes exactly
        # at the end of the file (typical for the last section of a file
        # without overlay) is still fully contained and must be merged.
        if section_data_start < file_size and section_data_end <= file_size:
            self.set_data_bytes(section_data_start, section.get_data())
def relocate_image(self, new_ImageBase):
    """Apply the relocation information to the image using the provided image base.

    This method will apply the relocation information to the image. Given the new
    base, all the relocations will be processed and both the raw data and the
    section's data will be fixed accordingly.

    The resulting image can be retrieved as well through the method:

        get_memory_mapped_image()

    In order to get something that would more closely match what could be found in
    memory once the Windows loader finished its work.

    Nothing at all is changed (ImageBase included) unless the optional header
    has a base relocation data directory (index 5) with a non-zero Size. When
    it does, the relocation entries are applied, OPTIONAL_HEADER.ImageBase is
    set to 'new_ImageBase' and the virtual addresses kept in the parsed import,
    TLS and load configuration directories are shifted by the same amount.
    Address fields holding zero are treated as "not set" and left alone.

    Relocation entries whose target can't be read are skipped.
    """
    relocation_difference = new_ImageBase - self.OPTIONAL_HEADER.ImageBase

    if not (
        len(self.OPTIONAL_HEADER.DATA_DIRECTORY) >= 6
        and self.OPTIONAL_HEADER.DATA_DIRECTORY[5].Size
    ):
        return

    if not hasattr(self, "DIRECTORY_ENTRY_BASERELOC"):
        self.parse_data_directories(
            directories=[DIRECTORY_ENTRY["IMAGE_DIRECTORY_ENTRY_BASERELOC"]]
        )
    if not hasattr(self, "DIRECTORY_ENTRY_BASERELOC"):
        self.__warnings.append(
            "Relocating image but PE does not have (or pefile cannot "
            "parse) a DIRECTORY_ENTRY_BASERELOC"
        )
    else:
        for reloc in self.DIRECTORY_ENTRY_BASERELOC:
            # We iterate with an index because if the relocation is of type
            # IMAGE_REL_BASED_HIGHADJ we need to also process the next entry
            # at once and skip it for the next iteration
            entry_idx = 0
            while entry_idx < len(reloc.entries):
                entry = reloc.entries[entry_idx]
                entry_idx += 1

                if entry.type == RELOCATION_TYPE["IMAGE_REL_BASED_ABSOLUTE"]:
                    # Nothing to do for this type of relocation
                    continue

                if entry.type == RELOCATION_TYPE["IMAGE_REL_BASED_HIGH"]:
                    # Add the high 16 bits of relocation_difference to the
                    # 16-bit value at RVA=entry.rva. The shift has to be
                    # applied to the difference only, hence the parentheses
                    # ('+' binds tighter than '>>').
                    word = self.get_word_at_rva(entry.rva)
                    if word is None:
                        continue
                    self.set_word_at_rva(
                        entry.rva,
                        (word + (relocation_difference >> 16)) & 0xFFFF,
                    )

                elif entry.type == RELOCATION_TYPE["IMAGE_REL_BASED_LOW"]:
                    # Add the low 16 bits of relocation_difference to the
                    # 16-bit value at RVA=entry.rva
                    word = self.get_word_at_rva(entry.rva)
                    if word is None:
                        continue
                    self.set_word_at_rva(
                        entry.rva, (word + relocation_difference) & 0xFFFF
                    )

                elif entry.type == RELOCATION_TYPE["IMAGE_REL_BASED_HIGHLOW"]:
                    # Add relocation_difference to the 32-bit value at
                    # RVA=entry.rva
                    dword = self.get_dword_at_rva(entry.rva)
                    if dword is None:
                        continue
                    self.set_dword_at_rva(
                        entry.rva, (dword + relocation_difference) & 0xFFFFFFFF
                    )

                elif entry.type == RELOCATION_TYPE["IMAGE_REL_BASED_HIGHADJ"]:
                    # Fix the high 16-bits of a relocation and adjust
                    #
                    # The 32-bit value is composed from the (16-bit value at
                    # RVA=entry.rva)<<16 plus the value taken from the next
                    # relocation entry; relocation_difference is added and
                    # the high 16 bits of the result are stored back.
                    # If the next entry is beyond the array's limits,
                    # abort... the table is corrupt
                    if entry_idx == len(reloc.entries):
                        break
                    next_entry = reloc.entries[entry_idx]
                    entry_idx += 1
                    word = self.get_word_at_rva(entry.rva)
                    if word is None:
                        continue
                    adjusted = (
                        (word << 16) + next_entry.rva + relocation_difference
                    ) & 0xFFFF0000
                    self.set_word_at_rva(entry.rva, adjusted >> 16)

                elif entry.type == RELOCATION_TYPE["IMAGE_REL_BASED_DIR64"]:
                    # Apply the difference to the 64-bit value at
                    # RVA=entry.rva. The result is wrapped to 64 bits so a
                    # negative difference or an overflow can still be packed.
                    qword = self.get_qword_at_rva(entry.rva)
                    if qword is None:
                        continue
                    self.set_qword_at_rva(
                        entry.rva,
                        (qword + relocation_difference) & 0xFFFFFFFFFFFFFFFF,
                    )

    self.OPTIONAL_HEADER.ImageBase = new_ImageBase

    def rebase_fields(structure, field_names):
        # Shift every listed field that exists and holds a non-zero address.
        for field_name in field_names:
            address = getattr(structure, field_name, None)
            if address:
                setattr(structure, field_name, address + relocation_difference)

    # correct VAs(virtual addresses) occurrences in directory information
    if hasattr(self, "DIRECTORY_ENTRY_IMPORT"):
        for dll in self.DIRECTORY_ENTRY_IMPORT:
            for func in dll.imports:
                func.address += relocation_difference

    if hasattr(self, "DIRECTORY_ENTRY_TLS"):
        rebase_fields(
            self.DIRECTORY_ENTRY_TLS.struct,
            (
                "StartAddressOfRawData",
                "EndAddressOfRawData",
                "AddressOfIndex",
                "AddressOfCallBacks",
            ),
        )

    if hasattr(self, "DIRECTORY_ENTRY_LOAD_CONFIG"):
        load_config = self.DIRECTORY_ENTRY_LOAD_CONFIG.struct
        rebase_fields(
            load_config,
            (
                "LockPrefixTable",
                "EditList",
                "SecurityCookie",
                "SEHandlerTable",
                "GuardCFCheckFunctionPointer",
                "GuardCFDispatchFunctionPointer",
                "GuardCFFunctionTable",
                "GuardAddressTakenIatEntryTable",
                "GuardLongJumpTargetTable",
                "DynamicValueRelocTable",
                "GuardRFFailureRoutine",
                "GuardRFFailureRoutineFunctionPointer",
                "GuardRFVerifyStackPointerFunctionPointer",
                "EnclaveConfigurationPointer",
            ),
        )
        # CHPEMetadataPointer is only rebased for PE32+ images.
        if (
            getattr(load_config, "CHPEMetadataPointer", None)
            and self.PE_TYPE == OPTIONAL_HEADER_MAGIC_PE_PLUS
        ):
            load_config.CHPEMetadataPointer += relocation_difference
def verify_checksum(self):
    """Tell whether the CheckSum stored in the optional header is correct.

    The stored value is compared against a freshly computed
    generate_checksum() result.
    """
    stored_checksum = self.OPTIONAL_HEADER.CheckSum
    return stored_checksum == self.generate_checksum()
def generate_checksum(self):
    """Compute the checksum of the PE image.

    The file data is first regenerated through write() so that changes made
    by assigning values to header fields (which are not automatically
    propagated to the raw data) are taken into account; the regenerated data
    replaces the current file data.

    The data is summed as little-endian dwords, skipping the dword that holds
    the CheckSum field itself, with the carry folded back into the low 32
    bits. The sum is then folded to 16 bits and the length of the data is
    added.

    Returns the checksum as an integer.
    """
    self.__data__ = self.write()

    # Get the offset to the CheckSum field in the OptionalHeader
    # (The offset is the same in PE32 and PE32+)
    checksum_offset = self.OPTIONAL_HEADER.get_file_offset() + 0x40  # 64
    checksum_index = checksum_offset // 4

    data = self.__data__
    data_len = len(data)

    # The data has to be dword-aligned: pad the trailing bytes with zeroes
    remainder = data_len % 4
    if remainder:
        data = data + (b"\0" * (4 - remainder))

    checksum = 0
    # PE data is little-endian regardless of the host, so the byte order is
    # given explicitly instead of relying on the native one.
    for index, (dword,) in enumerate(struct.iter_unpack("<I", data)):
        # Skip the checksum field
        if index == checksum_index:
            continue
        checksum += dword
        if checksum >= 2**32:
            checksum = (checksum & 0xFFFFFFFF) + (checksum >> 32)

    checksum = (checksum & 0xFFFF) + (checksum >> 16)
    checksum = checksum + (checksum >> 16)
    checksum = checksum & 0xFFFF

    # The length is the one of the original data, not the padded one
    return checksum + data_len
def is_exe(self):
    """Tell whether the file is a standard executable.

    The answer is True only for images that are neither a DLL nor a driver
    and that have the IMAGE_FILE_EXECUTABLE_IMAGE flag set in the file
    header's Characteristics.
    """
    executable_bit = IMAGE_CHARACTERISTICS["IMAGE_FILE_EXECUTABLE_IMAGE"]

    if self.is_dll():
        return False
    if self.is_driver():
        return False
    return (self.FILE_HEADER.Characteristics & executable_bit) == executable_bit
def is_dll(self):
    """Tell whether the file is a standard DLL.

    The answer is True only when the IMAGE_FILE_DLL flag is set in the file
    header's Characteristics.
    """
    dll_bit = IMAGE_CHARACTERISTICS["IMAGE_FILE_DLL"]
    return (self.FILE_HEADER.Characteristics & dll_bit) == dll_bit
def is_driver(self):
    """Tell whether the file looks like a Windows driver.

    Two indicators are considered reliable enough:

    - the image imports from one of a few kernel components, or
    - the image has a section named "page"/"paged" and its Subsystem is one
      of the native ones.

    The ImageBase lying in the upper 2GB of the address space and the
    IMAGE_SCN_MEM_NOT_PAGED section flag are deliberately not used, as
    neither is a reliable indicator (an invalid ImageBase, for instance, is
    a known trick to force relocation).

    Returns False when no import directory can be obtained.
    """
    # The import directory may not have been parsed yet (fast_load = True)
    if not hasattr(self, "DIRECTORY_ENTRY_IMPORT"):
        self.parse_data_directories(
            directories=[DIRECTORY_ENTRY["IMAGE_DIRECTORY_ENTRY_IMPORT"]]
        )

    # Still missing: the PE has no import directory or it is malformed
    if not hasattr(self, "DIRECTORY_ENTRY_IMPORT"):
        return False

    # From here on the import directory exists, although it may be empty.
    # Importing from any of these kernel components marks a driver.
    kernel_components = {
        b"ntoskrnl.exe",
        b"hal.dll",
        b"ndis.sys",
        b"bootvid.dll",
        b"kdcom.dll",
    }
    imported_dlls = {entry.dll.lower() for entry in self.DIRECTORY_ENTRY_IMPORT}
    if not kernel_components.isdisjoint(imported_dlls):
        return True

    paged_section_names = {b"page", b"paged"}
    section_names = {
        section.Name.lower().rstrip(b"\x00") for section in self.sections
    }
    if paged_section_names.isdisjoint(section_names):
        return False

    return self.OPTIONAL_HEADER.Subsystem in (
        SUBSYSTEM_TYPE["IMAGE_SUBSYSTEM_NATIVE"],
        SUBSYSTEM_TYPE["IMAGE_SUBSYSTEM_NATIVE_WINDOWS"],
    )
def get_overlay_data_start_offset(self):
    """Locate the data appended to the file and not described by the headers.

    The end (offset + size) of the optional header, of every section's raw
    data and of every data directory except the security one is computed.
    The largest of those ends that still lies within the file marks where
    the overlay begins.

    Returns that offset, or None when nothing follows it in the file.
    """
    file_size = len(self.__data__)

    # Zero is the starting point, matching a file with nothing described.
    region_ends = [0]

    if hasattr(self, "OPTIONAL_HEADER"):
        region_ends.append(
            self.OPTIONAL_HEADER.get_file_offset()
            + self.FILE_HEADER.SizeOfOptionalHeader
        )

    for section in self.sections:
        region_ends.append(section.PointerToRawData + section.SizeOfRawData)

    security_index = DIRECTORY_ENTRY["IMAGE_DIRECTORY_ENTRY_SECURITY"]
    for idx, directory in enumerate(self.OPTIONAL_HEADER.DATA_DIRECTORY):
        if idx == security_index:
            continue
        try:
            region_ends.append(
                self.get_offset_from_rva(directory.VirtualAddress) + directory.Size
            )
        except PEFormatError:
            # Ignore directories with RVA out of file
            continue

    # Regions reaching past the end of the file are not taken into account
    overlay_start = max(end for end in region_ends if end <= file_size)

    if file_size > overlay_start:
        return overlay_start
    return None
def get_overlay(self):
    """Return the data appended to the file and not described by the headers.

    Returns None when get_overlay_data_start_offset() finds no overlay.
    """
    overlay_start = self.get_overlay_data_start_offset()
    if overlay_start is None:
        return None
    return self.__data__[overlay_start:]
def trim(self):
    """Return only the data described by the PE headers, without any overlay.

    When get_overlay_data_start_offset() finds no overlay a copy of the whole
    file data is returned.
    """
    overlay_start = self.get_overlay_data_start_offset()
    if overlay_start is None:
        return self.__data__[:]
    return self.__data__[:overlay_start]
# According to http://corkami.blogspot.com/2010/01/parce-que-la-planche-aura-brule.html
# if PointerToRawData is less than 0x200 it's rounded to zero. Loading the test file
# in a debugger it's easy to verify that the PointerToRawData value of 1 is rounded
# to zero. Hence we reproduce the behavior
#
# According to the document:
# [ Microsoft Portable Executable and Common Object File Format Specification ]
# "The alignment factor (in bytes) that is used to align the raw data of sections in
# the image file. The value should be a power of 2 between 512 and 64 K, inclusive.
# The default is 512. If the SectionAlignment is less than the architecture's page
# size, then FileAlignment must match SectionAlignment."
#
# The following relies on a constant that is hard-coded in the Windows loader
def adjust_FileAlignment(self, val, file_alignment):
    """Adjust 'val' according to the given FileAlignment.

    The adjustment itself is delegated to cache_adjust_FileAlignment(). In
    addition, the first time a FileAlignment above the hard-coded value is
    found not to be a power of two, a warning is recorded.
    """
    report_bad_alignment = (
        file_alignment > FILE_ALIGNMENT_HARDCODED_VALUE
        and self.FileAlignment_Warning is False
        and not power_of_two(file_alignment)
    )
    if report_bad_alignment:
        self.__warnings.append(
            "If FileAlignment > 0x200 it should be a power of 2. Value: %x"
            % (file_alignment)
        )
        # Remember it so the warning is only issued once
        self.FileAlignment_Warning = True

    return cache_adjust_FileAlignment(val, file_alignment)
# According to the document:
# [ Microsoft Portable Executable and Common Object File Format Specification ]
# "The alignment (in bytes) of sections when they are loaded into memory. It must be
# greater than or equal to FileAlignment. The default is the page size for the
# architecture."
#
def adjust_SectionAlignment(self, val, section_alignment, file_alignment):
    """Adjust 'val' according to the given SectionAlignment.

    The adjustment itself is delegated to cache_adjust_SectionAlignment(). In
    addition, the first time a FileAlignment below the hard-coded value is
    found to differ from the SectionAlignment, a warning is recorded.
    """
    report_mismatch = (
        file_alignment < FILE_ALIGNMENT_HARDCODED_VALUE
        and file_alignment != section_alignment
        and self.SectionAlignment_Warning is False
    )
    if report_mismatch:
        self.__warnings.append(
            "If FileAlignment(%x) < 0x200 it should equal SectionAlignment(%x)"
            % (file_alignment, section_alignment)
        )
        # Remember it so the warning is only issued once
        self.SectionAlignment_Warning = True

    return cache_adjust_SectionAlignment(val, section_alignment, file_alignment)
def main():
    """Command line entry point.

    Without arguments the usage text is printed. "exports <filename>" lists
    the address (ImageBase included), name and ordinal of every exported
    symbol. Any other first argument is taken as a file name whose full dump
    is printed.

    Exits with an error message when "exports" is given without a file name
    or when the file has no export directory.
    """
    import sys

    usage = """\
pefile.py <filename>
pefile.py exports <filename>"""

    args = sys.argv[1:]
    if not args:
        print(usage)
        return

    if args[0] != "exports":
        print(PE(args[0]).dump_info())
        return

    if len(args) < 2:
        sys.exit("error: <filename> required")

    pe = PE(args[1])
    # The attribute only exists when the file has a parseable export
    # directory; report that cleanly instead of failing with AttributeError.
    export_directory = getattr(pe, "DIRECTORY_ENTRY_EXPORT", None)
    if export_directory is None:
        sys.exit("error: no export directory found")

    image_base = pe.OPTIONAL_HEADER.ImageBase
    for exp in export_directory.symbols:
        print(hex(image_base + exp.address), exp.name, exp.ordinal)
# Run the command line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()