diff --git a/.idea/objutils@1.iml b/.idea/objutils@1.iml index 8e33d7d..6276174 100644 --- a/.idea/objutils@1.iml +++ b/.idea/objutils@1.iml @@ -8,4 +8,4 @@ - \ No newline at end of file + diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 12a3299..aea4434 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -35,14 +35,14 @@ repos: - id: trailing-whitespace - repo: https://github.com/psf/black-pre-commit-mirror - rev: 26.5.0 + rev: 26.5.1 hooks: - id: black language_version: python3.13 args: ["--line-length=132"] - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.15.13 + rev: v0.15.14 hooks: - id: ruff args: ["--fix", "--exit-non-zero-on-fix"] diff --git a/objutils/elf/defs.py b/objutils/elf/defs.py index 620e7d0..7062a74 100644 --- a/objutils/elf/defs.py +++ b/objutils/elf/defs.py @@ -305,8 +305,8 @@ class ELFMachineType(enum.IntEnum): EM_ETPU = 178 # Freescale Extended Time Processing Unit. EM_SLE9X = 179 # Infineon Technologies SLE9X core. EM_L1OM = 180 # Intel L1OM. - EM_K10M = 181 # Intel K10M - EM_AARCH64 = 183 # ARM AArch64 + EM_K10M = 181 # Intel K10M + EM_AARCH64 = 183 # ARM AArch64 EM_AVR32 = 185 # Atmel Corporation 32-bit microprocessor family. EM_STM8 = 186 # STMicroeletronics STM8 8-bit microcontroller. EM_TILE64 = 187 # Tilera TILE64 multicore architecture family. @@ -314,40 +314,40 @@ class ELFMachineType(enum.IntEnum): EM_MICROBLAZE = 189 # Xilinx MicroBlaze 32-bit RISC soft processor core. EM_CUDA = 190 # NVIDIA CUDA architecture. 
EM_TILEGX = 191 # Tilera TILE-Gx - EM_CLOUDSHIELD = 192, # CloudShield architecture family - EM_COREA_1ST = 193, # KIPO-KAIST Core-A 1st generation processor family - EM_COREA_2ND = 194, # KIPO-KAIST Core-A 2nd generation processor family - EM_ARC_COMPACT2 = 195, # Synopsys ARCompact V2 - EM_OPEN8 = 196, # Open8 8-bit RISC soft processor core - EM_RL78 = 197, # Renesas RL78 family - EM_VIDEOCORE5 = 198, # Broadcom VideoCore V processor - EM_78KOR = 199, # Renesas 78KOR family - EM_56800EX = 200, # Freescale 56800EX Digital Signal Controller (DSC) - EM_BA1 = 201, # Beyond BA1 CPU architecture - EM_BA2 = 202, # Beyond BA2 CPU architecture - EM_XCORE = 203, # XMOS xCORE processor family - EM_MCHP_PIC = 204, # Microchip 8-bit PIC(r) family - EM_INTEL205 = 205, # Reserved by Intel - EM_INTEL206 = 206, # Reserved by Intel - EM_INTEL207 = 207, # Reserved by Intel - EM_INTEL208 = 208, # Reserved by Intel - EM_INTEL209 = 209, # Reserved by Intel - EM_KM32 = 210, # KM211 KM32 32-bit processor - EM_KMX32 = 211, # KM211 KMX32 32-bit processor - EM_KMX16 = 212, # KM211 KMX16 16-bit processor - EM_KMX8 = 213, # KM211 KMX8 8-bit processor - EM_KVARC = 214, # KM211 KVARC processor - EM_CDP = 215, # Paneve CDP architecture family - EM_COGE = 216, # Cognitive Smart Memory Processor - EM_COOL = 217, # iCelero CoolEngine - EM_NORC = 218, # Nanoradio Optimized RISC - EM_CSR_KALIMBA = 219, # CSR Kalimba architecture family - EM_AMDGPU = 224, # AMD GPU architecture - EM_RISCV = 243, # RISC-V - EM_LANAI = 244, # Lanai 32-bit processor - EM_BPF = 247, # Linux kernel bpf virtual machine - EM_VE = 251, # NEC SX-Aurora VE - EM_CSKY = 252, # C-SKY 32-bit processor + EM_CLOUDSHIELD = (192,) # CloudShield architecture family + EM_COREA_1ST = (193,) # KIPO-KAIST Core-A 1st generation processor family + EM_COREA_2ND = (194,) # KIPO-KAIST Core-A 2nd generation processor family + EM_ARC_COMPACT2 = (195,) # Synopsys ARCompact V2 + EM_OPEN8 = (196,) # Open8 8-bit RISC soft processor core + EM_RL78 = 
(197,) # Renesas RL78 family + EM_VIDEOCORE5 = (198,) # Broadcom VideoCore V processor + EM_78KOR = (199,) # Renesas 78KOR family + EM_56800EX = (200,) # Freescale 56800EX Digital Signal Controller (DSC) + EM_BA1 = (201,) # Beyond BA1 CPU architecture + EM_BA2 = (202,) # Beyond BA2 CPU architecture + EM_XCORE = (203,) # XMOS xCORE processor family + EM_MCHP_PIC = (204,) # Microchip 8-bit PIC(r) family + EM_INTEL205 = (205,) # Reserved by Intel + EM_INTEL206 = (206,) # Reserved by Intel + EM_INTEL207 = (207,) # Reserved by Intel + EM_INTEL208 = (208,) # Reserved by Intel + EM_INTEL209 = (209,) # Reserved by Intel + EM_KM32 = (210,) # KM211 KM32 32-bit processor + EM_KMX32 = (211,) # KM211 KMX32 32-bit processor + EM_KMX16 = (212,) # KM211 KMX16 16-bit processor + EM_KMX8 = (213,) # KM211 KMX8 8-bit processor + EM_KVARC = (214,) # KM211 KVARC processor + EM_CDP = (215,) # Paneve CDP architecture family + EM_COGE = (216,) # Cognitive Smart Memory Processor + EM_COOL = (217,) # iCelero CoolEngine + EM_NORC = (218,) # Nanoradio Optimized RISC + EM_CSR_KALIMBA = (219,) # CSR Kalimba architecture family + EM_AMDGPU = (224,) # AMD GPU architecture + EM_RISCV = (243,) # RISC-V + EM_LANAI = (244,) # Lanai 32-bit processor + EM_BPF = (247,) # Linux kernel bpf virtual machine + EM_VE = (251,) # NEC SX-Aurora VE + EM_CSKY = (252,) # C-SKY 32-bit processor EM_AVR_OLD = 0x1057 # AVR magic number. Written in the absense of an ABI. EM_MSP430_OLD = 0x1059 # MSP430 magic number. Written in the absense of everything. EM_MT = 0x2530 # Morpho MT. Written in the absense of an ABI. 
@@ -954,7 +954,7 @@ class ELFAbiType(enum.IntEnum): # ELFOSABI_FIRST_ARCH = 64, // First architecture-specific OS ABI # ELFOSABI_AMDGPU_HSA = 64, // AMD HSA runtime ELFOSABI_C6000_LINUX = 65 # Linux TMS320C6000 - ELFOSABI_AMDGPU_MESA3D = 66 # AMD GCN GPUs (GFX6+) for MESA runtime + ELFOSABI_AMDGPU_MESA3D = 66 # AMD GCN GPUs (GFX6+) for MESA runtime ELFOSABI_ARM = 97 # ARM ELFOSABI_STANDALONE = 255 # Standalone (embedded) application diff --git a/objutils/pecoff/pdb/__init__.py b/objutils/pecoff/pdb/__init__.py index f52d1e2..13e93c3 100644 --- a/objutils/pecoff/pdb/__init__.py +++ b/objutils/pecoff/pdb/__init__.py @@ -1,1446 +1,1451 @@ -#!/usr/bin/env python - -"""PDB debug symbol integration for PE/COFF files (Windows only). - -This module provides access to Microsoft Program Database (PDB) debug information -using the Windows dbghelp.dll API. It enables comprehensive symbol lookup beyond -the typically stripped COFF symbol table in release binaries. - -**Platform Support**: Windows only (requires dbghelp.dll) - -Overview: - PDB files contain rich debug information: - - - **Symbols**: Function names, variables, constants - - **Types**: Structures, unions, enums, typedefs - - **Source Info**: File names, line numbers - - **Call Frames**: Stack unwinding data - - ``` - PE File + PDB: - ┌──────────────┐ ┌──────────────┐ - │ app.exe │────>│ app.pdb │ - │ │ │ │ - │ Code │ │ - Symbols │ - │ Data │ │ - Types │ - │ (stripped) │ │ - Lines │ - └──────────────┘ └──────────────┘ - ``` - -Architecture: - **Windows dbghelp.dll**: - - Microsoft's debug helper library - - Symbol server support - - Handles PDB loading and parsing - - Provides symbol enumeration API - - **Symbol Enumeration**: - 1. Initialize dbghelp session (SymInitialize) - 2. Load PE module (SymLoadModuleExW) - 3. Set symbol search paths - 4. Enumerate symbols (SymEnumSymbolsA with callback) - 5. Extract type information (optional) - 6. 
Cleanup (SymCleanup) - - **Type Information Extraction**: - - Uses dbghelp type info API (SymGetTypeInfo) - - Recursively resolves pointers, arrays, structs - - Extracts sizes, offsets, field names - -Usage Examples: - **Basic Symbol Extraction**: - ```python - from objutils.pecoff.pdb import pdb_symbols_for_pe - - # Load symbols from PDB - symbols = pdb_symbols_for_pe("app.exe") - - for sym in symbols: - print(f"{sym['name']:40s} @ {sym['address']:#010x}") - ``` - - **With Symbol Search Path**: - ```python - # Search multiple directories for PDB - symbols = pdb_symbols_for_pe( - "app.exe", - symbol_path="C:\\Symbols;SRV*C:\\SymCache*https://msdl.microsoft.com/download/symbols" - ) - ``` - - **Advanced Session Management**: - ```python - from objutils.pecoff.pdb import PdbSession - - with PdbSession("app.exe", symbol_path=[".", "C:\\Symbols"]) as pdb: - # Enumerate all symbols - for sym in pdb.enum_symbols(): - if sym.is_function(): - print(f"Function: {sym.name} @ {sym.Address:#x}") - - # Get module info - info = pdb.get_module_info() - print(f"Module base: {info.base_of_dll:#x}") - ``` - - **Type Information Extraction**: - ```python - from objutils.pecoff.pdb import CTypeInfoDump - - # Extract C type definitions - type_dumper = CTypeInfoDump(pdb_session.handle, base_address) - type_info = type_dumper.get_type_from_type_index(type_idx) - print(f"Type: {type_info['type_name']}, Size: {type_info['size']}") - ``` - -Key Components: - **Enums**: - - **SymTagEnum**: Symbol tag types (function, data, UDT, etc.) - - **BasicType**: Primitive types (int, float, void, etc.) - - **SymFlag**: Symbol flags (export, local, function, etc.) 
- - **IMAGEHLP_SYMBOL_TYPE_INFO**: Type info query constants - - **Data Classes**: - - **ModuleInfo**: Module metadata (base address, size, entry point) - - **SYMBOL_INFO**: Symbol information structure (ctypes) - - **MODULEINFO**: Windows API module info structure - - **Core Classes**: - - **CTypeInfoDump**: Type information extraction and resolution - - **PdbSession**: Manages dbghelp.dll lifetime and operations - -dbghelp.dll API: - The module wraps these key dbghelp functions: - - - **SymInitialize**: Initialize symbol handler - - **SymCleanup**: Cleanup symbol handler - - **SymLoadModuleExW**: Load module for symbol resolution - - **SymEnumSymbolsA**: Enumerate symbols with callback - - **SymGetTypeInfo**: Query type information - - **SymSetSearchPath/SymGetSearchPath**: Symbol path management - -Symbol Search Paths: - dbghelp supports flexible symbol search: - - - **Local paths**: "C:\\Symbols;D:\\Debug" - - **Symbol servers**: "SRV*C:\\Cache*https://msdl.microsoft.com/download/symbols" - - **Combined**: "C:\\Local;SRV*C:\\Cache*https://server" - - The `_NT_SYMBOL_PATH` environment variable is respected. - -Limitations: - - **Windows only**: Requires dbghelp.dll (unavailable on Linux/Mac) - - **PDB required**: Release binaries typically lack embedded COFF symbols - - **Architecture match**: PDB must match PE architecture (x86/x64) - - **Version match**: PDB should match PE build (GUID/age check) - - **Type info**: Complex recursive structures may have limitations - -Error Handling: - On non-Windows platforms, dbghelp/kernel32/psapi are set to None: - - ```python - from objutils.pecoff.pdb import _WINDOWS - - if not _WINDOWS: - print("PDB support unavailable (not Windows)") - ``` - - Import errors are caught and gracefully handled in __init__.py. 
- -See Also: - - objutils.pecoff: Main PE parser that uses this module - - objutils.pecoff.defs: PE/COFF constants - - objutils.elf.model: Similar ORM pattern for ELF - - Microsoft dbghelp.dll documentation - - PDB format specification - -Example Integration: - ```python - from objutils.pecoff import PeParser - - # PeParser automatically attempts PDB loading - pe = PeParser("kernel32.dll", pdb_path=["C:\\Symbols"]) - - # Symbols now include PDB data if found - for sym in pe.symbols: - print(f"{sym['name']}: {sym['value']:#x}") - ``` -""" - -import ctypes -import enum -from copy import copy -from ctypes import wintypes -from dataclasses import dataclass -from enum import IntEnum -from functools import lru_cache -from typing import Any, Optional - -# DLLs -try: - dbghelp = ctypes.WinDLL("dbghelp") # type: ignore[attr-defined] - kernel32 = ctypes.WinDLL("kernel32") # type: ignore[attr-defined] - psapi = ctypes.WinDLL("psapi") - _WINDOWS = True -except OSError: # pragma: no cover - non-Windows environment - dbghelp = None # type: ignore[assignment] - kernel32 = None # type: ignore[assignment] - psapi = None - _WINDOWS = False - -from objutils import symbols - - -@dataclass -class ModuleInfo: - """Module metadata extracted from Windows process. 
- - Attributes: - base_of_dll: Base address where module is loaded in memory - size_of_image: Size of module in memory (bytes) - entry_point: Address of module entry point (or None) - - Example: - ```python - info = pdb_session.get_module_info() - print(f"Module: {info.base_of_dll:#x} - {info.base_of_dll + info.size_of_image:#x}") - ``` - """ - - base_of_dll: int - size_of_image: int - entry_point: Optional[int] - - -# Types -HANDLE = wintypes.HANDLE -HLOCAL = wintypes.HANDLE -DWORD = wintypes.DWORD -ULONG = wintypes.ULONG -ULONG64 = ctypes.c_ulonglong -BOOL = wintypes.BOOL -LPVOID = wintypes.LPVOID -LPCWSTR = wintypes.LPCWSTR -LPCSTR = wintypes.LPCSTR - - -class VARTYPE(IntEnum): - """OLE Automation variant type discriminator (vt field of VARIANT).""" - - VT_EMPTY = 0 - VT_NULL = 1 - VT_I2 = 2 - VT_I4 = 3 - VT_R4 = 4 - VT_R8 = 5 - VT_BSTR = 8 - VT_BOOL = 11 - VT_I1 = 16 - VT_UI1 = 17 - VT_UI2 = 18 - VT_UI4 = 19 - VT_I8 = 20 - VT_UI8 = 21 - VT_INT = 22 - VT_UINT = 23 - - -class _VARIANT_VALUE(ctypes.Union): - """Inner value union of a COM VARIANT (covers numeric and pointer cases).""" - - _fields_ = [ - ("llVal", ctypes.c_longlong), - ("lVal", ctypes.c_long), - ("bVal", ctypes.c_ubyte), - ("iVal", ctypes.c_short), - ("fltVal", ctypes.c_float), - ("dblVal", ctypes.c_double), - ("boolVal", ctypes.c_short), - ("scode", ctypes.c_long), - ("cVal", ctypes.c_int8), - ("uiVal", ctypes.c_ushort), - ("ulVal", ctypes.c_ulong), - ("ullVal", ctypes.c_ulonglong), - ("intVal", ctypes.c_int), - ("uintVal", ctypes.c_uint), - ("byref", ctypes.c_void_p), - ] - - -class VARIANT(ctypes.Structure): - """Minimal ctypes representation of the OLE Automation VARIANT structure. - - The full COM VARIANT is a discriminated union keyed on the ``vt`` field. - Only the scalar numeric types that can appear as PDB constant values are - covered here; pointer/array/record sub-types are not needed. - - Total size is 16 bytes (matching the Windows ABI definition). 
- """ - - _fields_ = [ - ("vt", ctypes.c_ushort), - ("wReserved1", ctypes.c_ushort), - ("wReserved2", ctypes.c_ushort), - ("wReserved3", ctypes.c_ushort), - ("_value", _VARIANT_VALUE), - ] - - -def _variant_to_python(variant): - """Convert a VARIANT value to an appropriate Python primitive. - - Only the numeric VARTYPE values that are relevant for PDB constant symbols - are handled. Unknown or unsupported types return ``None``. - """ - try: - kind = VARTYPE(variant.vt) - except ValueError: - return None - v = variant._value - _map = { - VARTYPE.VT_I1: lambda: int(v.cVal), - VARTYPE.VT_I2: lambda: int(v.iVal), - VARTYPE.VT_I4: lambda: int(v.lVal), - VARTYPE.VT_I8: lambda: int(v.llVal), - VARTYPE.VT_UI1: lambda: int(v.bVal), - VARTYPE.VT_UI2: lambda: int(v.uiVal), - VARTYPE.VT_UI4: lambda: int(v.ulVal), - VARTYPE.VT_UI8: lambda: int(v.ullVal), - VARTYPE.VT_INT: lambda: int(v.intVal), - VARTYPE.VT_UINT: lambda: int(v.uintVal), - VARTYPE.VT_R4: lambda: float(v.fltVal), - VARTYPE.VT_R8: lambda: float(v.dblVal), - VARTYPE.VT_BOOL: lambda: bool(v.boolVal), - } - fn = _map.get(kind) - return fn() if fn is not None else None - - -class TI_FINDCHILDREN_PARAMS(ctypes.Structure): - _fields_ = [ - ("Count", ULONG), - ("Start", ULONG), - ("ChildId", ULONG * 1), - ] - - -# SYMBOL_INFO struct (ANSI) -MAX_SYM_NAME = 2000 - - -class SYMBOL_INFO(ctypes.Structure): - """Windows API structure for symbol information. - - Used with dbghelp.dll SymEnumSymbolsA to enumerate symbols. - Contains detailed information about a symbol including name, address, - flags, and type information. 
- - Key Attributes: - Name: Symbol name (null-terminated char array) - Address: Absolute address in memory - ModBase: Module base address - Flags: Symbol flags (SymFlag enum values) - Tag: Symbol tag type (SymTagEnum values) - Size: Symbol size in bytes - Value: Symbol value (for constants) - - Helper Methods: - is_function(): True if symbol is a function - is_export(): True if symbol is exported - is_local(): True if symbol is local variable - is_parameter(): True if symbol is function parameter - decode_flags(): List of flag names - - Properties: - name: Decoded symbol name (str) - tag: Symbol tag name (str) - rel_address: Relative address (Address - ModBase) - - Example: - ```python - # Used in enumeration callback - def callback(sym_info, size, context): - sym = ctypes.cast(sym_info, ctypes.POINTER(SYMBOL_INFO)).contents - if sym.is_function(): - print(f"Function: {sym.name} @ {sym.Address:#x}") - return True # Continue enumeration - ``` - """ - - def is_clr_token(self) -> bool: - """Check if symbol is a CLR token (.NET managed code).""" - return bool(self.Flags & SymFlag.SYMFLAG_CLR_TOKEN) - return bool(self.Flags & SymFlag.SYMFLAG_CLR_TOKEN) - - def is_constant(self) -> bool: - """Check if symbol is a constant value.""" - return bool(self.Flags & SymFlag.SYMFLAG_CONSTANT) - - def is_export(self) -> bool: - """Check if symbol is exported from module.""" - return bool(self.Flags & SymFlag.SYMFLAG_EXPORT) - - def is_forwarder(self) -> bool: - """Check if symbol is an export forwarder.""" - return bool(self.Flags & SymFlag.SYMFLAG_FORWARDER) - - def is_framerel(self) -> bool: - """Check if symbol is frame-relative (stack variable).""" - return bool(self.Flags & SymFlag.SYMFLAG_FRAMEREL) - - def is_function(self) -> bool: - """Check if symbol is a function.""" - return bool(self.Flags & SymFlag.SYMFLAG_FUNCTION) - - def is_ilrel(self) -> bool: - """Check if symbol is IL-relative (.NET managed code).""" - return bool(self.Flags & SymFlag.SYMFLAG_ILREL) - - def 
is_local(self) -> bool: - """Check if symbol is a local variable.""" - return bool(self.Flags & SymFlag.SYMFLAG_LOCAL) - - def is_metadata(self) -> bool: - """Check if symbol is metadata.""" - return bool(self.Flags & SymFlag.SYMFLAG_METADATA) - - def is_parameter(self) -> bool: - """Check if symbol is a function parameter.""" - return bool(self.Flags & SymFlag.SYMFLAG_PARAMETER) - - def is_register(self) -> bool: - """Check if symbol is in a register.""" - return bool(self.Flags & SymFlag.SYMFLAG_REGISTER) - - def is_regrel(self) -> bool: - """Check if symbol is register-relative.""" - return bool(self.Flags & SymFlag.SYMFLAG_REGREL) - - def is_slot(self) -> bool: - """Check if symbol is a slot (.NET managed code).""" - return bool(self.Flags & SymFlag.SYMFLAG_SLOT) - - def is_thunk(self) -> bool: - """Check if symbol is a thunk (jump stub).""" - return bool(self.Flags & SymFlag.SYMFLAG_THUNK) - - def is_tlsrel(self) -> bool: - """Check if symbol is thread-local storage relative.""" - return bool(self.Flags & SymFlag.SYMFLAG_TLSREL) - - def is_value_present(self) -> bool: - """Check if symbol has value field populated.""" - return bool(self.Flags & SymFlag.SYMFLAG_VALUEPRESENT) - - def is_virtual(self) -> bool: - """Check if symbol is virtual.""" - return bool(self.Flags & SymFlag.SYMFLAG_VIRTUAL) - - # @cached_property - def decode_flags(self) -> list[str]: - """Decode Flags field to list of flag names. - - Returns: - List of flag names (e.g., ["SYMFLAG_FUNCTION", "SYMFLAG_EXPORT"]) - """ - return [f.name for f in SymFlag if self.Flags & f.value] - - # @cached_property - @property - def name(self): - """Get symbol name as decoded string. - - Returns: - Symbol name (str), ignoring decode errors - """ - return self.Name.decode(errors="ignore") - - # @cached_property - @property - def tag(self): - """Get symbol tag name. 
- - Returns: - Tag name (e.g., "SymTagFunction"), or "SymTagNull" if invalid - """ - try: - return SymTagEnum(self.Tag).name - except ValueError: - return SymTagEnum.SymTagNull.name - - # @cached_property - @property - def rel_address(self): - """Get symbol address relative to module base. - - Returns: - Relative virtual address (RVA) - """ - return self.Address - (0 if self.ModBase is None else self.ModBase) - - def __repr__(self) -> str: - name = self.Name.decode(errors="ignore") - return f"" - - _fields_ = [ - ("SizeOfStruct", ULONG), - ("TypeIndex", ULONG), - ("Reserved", ULONG64 * 2), - ("Index", ULONG), - ("Size", ULONG), - ("ModBase", ULONG64), - ("Flags", ULONG), - ("Value", ULONG64), - ("Address", ULONG64), - ("Register", ULONG), - ("Scope", ULONG), - ("Tag", ULONG), - ("NameLen", ULONG), - ("MaxNameLen", ULONG), - ("Name", ctypes.c_char * (MAX_SYM_NAME + 1)), - ] - - -# MODULEINFO struct -class MODULEINFO(ctypes.Structure): - """Windows API structure for module information (from psapi.dll). - - Used with GetModuleInformation to query module metadata. - """ - - _fields_ = [ - ("lpBaseOfDll", LPVOID), - ("SizeOfImage", DWORD), - ("EntryPoint", LPVOID), - ] - - -class SymTagEnum(IntEnum): - """Symbol tag types for PDB symbols. - - Defines the kind of symbol (function, data, type, etc.). - Used in SYMBOL_INFO.Tag field. 
- - Common Values: - SymTagFunction (5): Function symbol - SymTagData (7): Variable symbol - SymTagPublicSymbol (10): Exported symbol - SymTagUDT (11): User-defined type (struct/class) - SymTagEnum (12): Enumeration type - SymTagPointerType (14): Pointer type - SymTagArrayType (15): Array type - SymTagBaseType (16): Primitive type - """ - - SymTagNull = 0 - SymTagExe = 1 - SymTagCompiland = 2 - SymTagCompilandDetails = 3 - SymTagCompilandEnv = 4 - SymTagFunction = 5 - SymTagBlock = 6 - SymTagData = 7 - SymTagAnnotation = 8 - SymTagLabel = 9 - SymTagPublicSymbol = 10 - SymTagUDT = 11 - SymTagEnum = 12 - SymTagFunctionType = 13 - SymTagPointerType = 14 - SymTagArrayType = 15 - SymTagBaseType = 16 - SymTagTypedef = 17 - SymTagBaseClass = 18 - SymTagFriend = 19 - SymTagFunctionArgType = 20 - SymTagFuncDebugStart = 21 - SymTagFuncDebugEnd = 22 - SymTagUsingNamespace = 23 - SymTagVTableShape = 24 - SymTagVTable = 25 - SymTagCustom = 26 - SymTagThunk = 27 - SymTagCustomType = 29 - SymTagManagedType = 30 - SymTagDimension = 31 - - -class UdtKind(enum.IntEnum): - UdtStruct = 0 - UdtClass = 1 - UdtUnion = 2 - - -class IMAGEHLP_SYMBOL_TYPE_INFO(IntEnum): - """Constants for SymGetTypeInfo queries. - - Used with dbghelp.SymGetTypeInfo to query type information. 
- - Common Queries: - TI_GET_SYMTAG (0): Get symbol tag - TI_GET_SYMNAME (1): Get symbol name - TI_GET_LENGTH (2): Get type size in bytes - TI_GET_TYPE (3): Get type index - TI_GET_BASETYPE (5): Get base type (BasicType enum) - TI_GET_CHILDRENCOUNT (13): Get count of child members - TI_GET_OFFSET (10): Get member offset in struct - """ - - TI_GET_SYMTAG = 0 - TI_GET_SYMNAME = 1 - TI_GET_LENGTH = 2 - TI_GET_TYPE = 3 - TI_GET_TYPEID = 4 - TI_GET_BASETYPE = 5 - TI_GET_ARRAYINDEXTYPEID = 6 - TI_FINDCHILDREN = 7 - TI_GET_DATAKIND = 8 - TI_GET_ADDRESSOFFSET = 9 - TI_GET_OFFSET = 10 - TI_GET_VALUE = 11 - TI_GET_COUNT = 12 - TI_GET_CHILDRENCOUNT = 13 - TI_GET_BITPOSITION = 14 - TI_GET_VIRTUALBASECLASS = 15 - TI_GET_VIRTUALTABLESHAPEID = 16 - TI_GET_VIRTUALBASEPOINTEROFFSET = 17 - TI_GET_CLASSTYPEID = 18 - TI_GET_NESTED = 19 - TI_GET_SYMINDEX = 20 - TI_GET_LEXICALPARENT = 21 - TI_GET_ADDRESS = 22 - TI_GET_THISADJUST = 23 - TI_GET_UDTKIND = 24 - TI_IS_EQUIV_TO = 25 - TI_GET_CALLING_CONVENTION = 26 - TI_IS_CLOSE_EQUIV_TO = 27 - TI_GTIEX_REQS_VALID = 28 - TI_GET_VIRTUALBASEOFFSET = 29 - TI_GET_VIRTUALBASEDISPINDEX = 30 - TI_GET_IS_REFERENCE = 31 - TI_GET_INDIRECTVIRTUALBASEDISPINDEX = 32 - TI_GET_VIRTUALBASETABLETYPEID = 33 - TI_GET_OBJECTPOINTERTYPEID = 34 - TI_GET_IS_CONST = 35 - TI_GET_IS_VOLATILE = 36 - TI_GET_IS_UNALIGNED = 37 - - -class BasicType(IntEnum): - """Primitive type identifiers for PDB types. - - Used with TI_GET_BASETYPE query to identify base types. 
- - Common Types: - btVoid (1): void type - btChar (2): char type - btInt (6): signed integer - btUInt (7): unsigned integer - btFloat (8): floating point - btBool (10): boolean - btLong (13): long integer - btULong (14): unsigned long - """ - - btNoType = 0 - btVoid = 1 - btChar = 2 - btWChar = 3 - btInt = 6 - btUInt = 7 - btFloat = 8 - btBCD = 9 - btBool = 10 - btLong = 13 - btULong = 14 - btCurrency = 25 - btDate = 26 - btVariant = 27 - btComplex = 28 - btBit = 29 - btBSTR = 30 - btHresult = 31 - btChar16 = 32 - btChar32 = 33 - btChar8 = 34 - - -PRIMITIVE_TYPEMAP = { - BasicType.btNoType: "", - BasicType.btVoid: "void", - BasicType.btChar: "char", - BasicType.btWChar: "wchar", - BasicType.btInt: "signed int", - BasicType.btUInt: "unsigned int", - BasicType.btFloat: "float", - BasicType.btBCD: "BCD", - BasicType.btBool: "bool", - BasicType.btLong: "long", - BasicType.btULong: "unsigned long", - BasicType.btCurrency: "CURRENCY", - BasicType.btDate: "DATE", - BasicType.btVariant: "VARIANT", - BasicType.btComplex: "complex", - BasicType.btBit: "BIT", - BasicType.btBSTR: "BSTR", - BasicType.btHresult: "HRESULT", - BasicType.btChar16: "Char16", - BasicType.btChar32: "Char32", - BasicType.btChar8: "Char8", -} - - -class DataKind(enum.IntEnum): - DataIsUnknown = 0 - DataIsLocal = 1 - DataIsStaticLocal = 2 - DataIsParam = 3 - DataIsObjectPtr = 4 - DataIsFileStatic = 5 - DataIsGlobal = 6 - DataIsMember = 7 - DataIsStaticMember = 8 - DataIsConstant = 9 - - -class SymFlag(IntEnum): - """Symbol flags for SYMBOL_INFO.Flags field. - - Bit flags indicating symbol properties. 
- - Common Flags: - SYMFLAG_FUNCTION (0x800): Symbol is a function - SYMFLAG_EXPORT (0x200): Symbol is exported - SYMFLAG_LOCAL (0x80): Symbol is local variable - SYMFLAG_PARAMETER (0x40): Symbol is function parameter - SYMFLAG_REGISTER (0x8): Symbol is in register - SYMFLAG_CONSTANT (0x100): Symbol is a constant - SYMFLAG_VALUEPRESENT (0x1): Value field is valid - """ - - SYMFLAG_VALUEPRESENT = 0x00000001 - SYMFLAG_REGISTER = 0x00000008 - SYMFLAG_REGREL = 0x00000010 - SYMFLAG_FRAMEREL = 0x00000020 - SYMFLAG_PARAMETER = 0x00000040 - SYMFLAG_LOCAL = 0x00000080 - SYMFLAG_CONSTANT = 0x00000100 - SYMFLAG_EXPORT = 0x00000200 - SYMFLAG_FORWARDER = 0x00000400 - SYMFLAG_FUNCTION = 0x00000800 - SYMFLAG_VIRTUAL = 0x00001000 - SYMFLAG_THUNK = 0x00002000 - SYMFLAG_TLSREL = 0x00004000 - SYMFLAG_SLOT = 0x00008000 - SYMFLAG_ILREL = 0x00010000 - SYMFLAG_METADATA = 0x00020000 - SYMFLAG_CLR_TOKEN = 0x00040000 - - -# Prototypes -if _WINDOWS: - dbghelp.SymInitialize.argtypes = [HANDLE, LPCWSTR, BOOL] - dbghelp.SymInitialize.restype = BOOL - - dbghelp.SymCleanup.argtypes = [HANDLE] - dbghelp.SymCleanup.restype = BOOL - - dbghelp.SymSetOptions.argtypes = [DWORD] - dbghelp.SymSetOptions.restype = DWORD - - dbghelp.SymGetOptions.argtypes = [] - dbghelp.SymGetOptions.restype = DWORD - - dbghelp.SymLoadModuleExW.argtypes = [HANDLE, HANDLE, LPCWSTR, LPCWSTR, ULONG64, DWORD, LPVOID, DWORD] - dbghelp.SymLoadModuleExW.restype = ULONG64 # returns base - - dbghelp.SymSetSearchPath.argtypes = [HANDLE, LPCSTR] - dbghelp.SymSetSearchPath.restype = BOOL - - dbghelp.SymGetSearchPath.argtypes = [HANDLE, ctypes.c_char_p, DWORD] - dbghelp.SymGetSearchPath.restype = BOOL - -# SymEnumSymbolsA callback and function -if _WINDOWS: - PSYM_ENUMERATESYMBOLS_CALLBACK = ctypes.WINFUNCTYPE( - BOOL, - ctypes.POINTER(SYMBOL_INFO), - ULONG, - LPVOID, - ) - -if _WINDOWS: - dbghelp.SymEnumSymbols.argtypes = [HANDLE, ULONG64, LPCSTR, PSYM_ENUMERATESYMBOLS_CALLBACK, LPVOID] - dbghelp.SymEnumSymbols.restype = BOOL - -# 
SymFromAddr -if _WINDOWS: - dbghelp.SymFromAddr.argtypes = [HANDLE, ULONG64, ctypes.POINTER(ULONG64), ctypes.POINTER(SYMBOL_INFO)] - dbghelp.SymFromAddr.restype = BOOL - - dbghelp.SymGetTypeInfo.argtypes = [HANDLE, ULONG64, ULONG, ctypes.c_int, LPVOID] - dbghelp.SymGetTypeInfo.restype = BOOL - -# Kernel32 helpers -if _WINDOWS: - kernel32.GetCurrentProcess.restype = HANDLE - psapi.GetModuleInformation.argtypes = [HANDLE, HANDLE, ctypes.POINTER(MODULEINFO), DWORD] - psapi.GetModuleInformation.restype = BOOL - kernel32.GetLastError.restype = DWORD - kernel32.LoadLibraryA.argtypes = [LPCSTR] - kernel32.LoadLibraryA.restype = HANDLE - kernel32.FreeLibrary.argtypes = [HANDLE] - kernel32.FreeLibrary.restype = BOOL - kernel32.LocalFree.argtypes = [HLOCAL] - kernel32.LocalFree.restype = HLOCAL - - -def last_error(): - if not _WINDOWS: - return 0 - return kernel32.GetLastError() - - -# SYMOPT flags (subset) -SYMOPT_DEFERRED_LOADS = 0x00000004 -SYMOPT_UNDNAME = 0x00000002 -SYMOPT_LOAD_LINES = 0x00000010 - - -def load_library(lib_path: str) -> HANDLE: - """Loads the specified module into the address space of the calling process.""" - if not _WINDOWS: - raise OSError("PDB support requires Windows (kernel32.dll)") - handle = kernel32.LoadLibraryA(lib_path.encode("ascii")) - if not handle: - raise OSError(f"LoadLibraryA failed for {lib_path}, error={last_error()}") - return handle - - -def free_library(hmod: HANDLE) -> None: - """Frees the loaded dynamic-link library (DLL) module.""" - if not _WINDOWS: - raise OSError("PDB support requires Windows (kernel32.dll)") - if not kernel32.FreeLibrary(hmod): - raise OSError(f"FreeLibrary failed, error={last_error()}") - - -class CTypeInfoDump: - """Extracts C type information from PDB debug symbols. - - Recursively resolves type definitions including pointers, arrays, - structures, unions, and base types. Uses dbghelp.SymGetTypeInfo - to query type metadata. 
- - Attributes: - process: dbghelp process handle - mod_base: Module base address - - Type Resolution Algorithm: - 1. Query type tag (pointer, array, UDT, base type, etc.) - 2. For compound types: - - Pointer: Resolve pointed-to type - - Array: Resolve element type and count - - UDT: Enumerate members recursively - 3. Calculate sizes and offsets - 4. Build type dictionary with metadata - - Example: - ```python - type_dumper = CTypeInfoDump(pdb_session.handle, base_address) - - # Get type info for a symbol - type_info = type_dumper.get_type_from_type_index(type_idx) - print(f"Type: {type_info['type_name']}") - print(f"Size: {type_info['size']} bytes") - - # For struct, enumerate members - if 'members' in type_info: - for member in type_info['members']: - print(f" {member['name']}: {member['type_name']} @ +{member['offset']}") - ``` - - Note: - Type resolution can be slow for complex recursive structures. - Use caching when querying multiple symbols. - """ - - def __init__(self, process, mod_base): - """Initialize type info dumper. - - Args: - process: dbghelp process handle from PdbSession - mod_base: Module base address - """ - self.process = process - self.mod_base = mod_base - self._type_cache: dict[int, symbols.TypeInfo] = {} - self._resolving: set[int] = set() - - def get_type_info(self, type_id, info_type): - """Query type information from dbghelp. - - Args: - type_id: Type index to query - info_type: IMAGEHLP_SYMBOL_TYPE_INFO constant - - Returns: - Type information value (type depends on info_type): - - String for TI_GET_SYMNAME - - Integer for TI_GET_LENGTH, TI_GET_COUNT, etc. - - Boolean for TI_GET_IS_CONST, TI_GET_IS_VOLATILE, etc. - - None if query fails - - Note: - Different info_type values return different data types. - Memory for strings (TI_GET_SYMNAME) is automatically freed. 
- """ - if not _WINDOWS: - return None - if info_type in (IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_SYMNAME,): - ptr = ctypes.c_void_p() - if dbghelp.SymGetTypeInfo(self.process, self.mod_base, type_id, info_type.value, ctypes.byref(ptr)): - if ptr.value: - name = ctypes.wstring_at(ptr) - kernel32.LocalFree(ptr) - return name - elif info_type in ( - IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_SYMTAG, - IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_BASETYPE, - IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_TYPE, - IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_TYPEID, - IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_ARRAYINDEXTYPEID, - IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_COUNT, - IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_CHILDRENCOUNT, - IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_OFFSET, - IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_DATAKIND, - IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_UDTKIND, - IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_ADDRESSOFFSET, - IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_BITPOSITION, - IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_VIRTUALTABLESHAPEID, - IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_VIRTUALBASEPOINTEROFFSET, - IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_CLASSTYPEID, - IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_NESTED, - IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_SYMINDEX, - IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_LEXICALPARENT, - IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_THISADJUST, - IMAGEHLP_SYMBOL_TYPE_INFO.TI_IS_EQUIV_TO, - IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_CALLING_CONVENTION, - IMAGEHLP_SYMBOL_TYPE_INFO.TI_IS_CLOSE_EQUIV_TO, - IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_VIRTUALBASEOFFSET, - IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_VIRTUALBASEDISPINDEX, - IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_OBJECTPOINTERTYPEID, - ): - out = DWORD() - if dbghelp.SymGetTypeInfo(self.process, self.mod_base, type_id, info_type.value, ctypes.byref(out)): - return out.value - elif info_type == IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_VALUE: - out = VARIANT() - if dbghelp.SymGetTypeInfo(self.process, self.mod_base, type_id, info_type.value, ctypes.byref(out)): - return _variant_to_python(out) - elif info_type in (IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_LENGTH, - 
IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_ADDRESS, - IMAGEHLP_SYMBOL_TYPE_INFO.TI_GTIEX_REQS_VALID): - out = ULONG64() - if dbghelp.SymGetTypeInfo(self.process, self.mod_base, type_id, info_type.value, ctypes.byref(out)): - return out.value - elif info_type in ( - IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_IS_CONST, - IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_IS_VOLATILE, - IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_IS_UNALIGNED, - IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_IS_REFERENCE, - IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_VIRTUALBASECLASS, - ): - out = BOOL() - if dbghelp.SymGetTypeInfo(self.process, self.mod_base, type_id, info_type.value, ctypes.byref(out)): - return bool(out.value) - elif info_type == IMAGEHLP_SYMBOL_TYPE_INFO.TI_FINDCHILDREN: - count = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_CHILDRENCOUNT) - if not count: - return [] - - # TI_FINDCHILDREN_PARAMS is a variable-sized structure. - # We need to allocate enough space for Count, Start, and all ChildIds. - size = ctypes.sizeof(TI_FINDCHILDREN_PARAMS) + (count - 1) * ctypes.sizeof(ULONG) - buf = (ctypes.c_char * size)() - params = ctypes.cast(buf, ctypes.POINTER(TI_FINDCHILDREN_PARAMS)) - params.contents.Count = count - params.contents.Start = 0 - - if dbghelp.SymGetTypeInfo(self.process, self.mod_base, type_id, info_type.value, params): - # Access ChildId as an array of length 'count' - child_ids = ctypes.cast(params.contents.ChildId, ctypes.POINTER(ULONG * count)) - return list(child_ids.contents) - return None - - def _get_referenced_type_id(self, type_id: int) -> int | None: - child_id = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_TYPEID) - if child_id is None: - child_id = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_TYPE) - return child_id - - def _wrap_qualifiers(self, type_id: int, tp: symbols.TypeInfo) -> symbols.TypeInfo: - if self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_IS_CONST): - tp = symbols.ConstantType(tp) - if self.get_type_info(type_id, 
IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_IS_VOLATILE): - tp = symbols.VolatileType(tp) - return tp - - def get_data(self, type_id: int) -> symbols.DataType | None: - tag_val = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_SYMTAG) - if tag_val is None: - return None - tag = SymTagEnum(tag_val) - if tag != SymTagEnum.SymTagData: - return None - name = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_SYMNAME) - tp = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_TYPE) - base_type = self.get_full_type_name(tp) if tp is not None else symbols.UnspecifiedType("unknown") - data_kind_value = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_DATAKIND) - try: - data_kind = DataKind(data_kind_value) - except (TypeError, ValueError): - data_kind = DataKind.DataIsUnknown - - value: Any = None - if data_kind == DataKind.DataIsConstant: - value = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_VALUE) - elif data_kind in (DataKind.DataIsGlobal, DataKind.DataIsStaticLocal, DataKind.DataIsFileStatic, - DataKind.DataIsStaticMember): - value = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_ADDRESS) - elif data_kind in (DataKind.DataIsLocal, DataKind.DataIsParam, DataKind.DataIsObjectPtr, DataKind.DataIsMember): - value = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_OFFSET) - return symbols.DataType(name or "", value, base_type, data_kind) - - def get_enumerators(self, type_id: int) -> list[symbols.Enumerator]: - tag_val = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_SYMTAG) - if tag_val is None: - return [] - tag = SymTagEnum(tag_val) - if tag != SymTagEnum.SymTagEnum: - return [] - chs = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_FINDCHILDREN) - if not chs: - return [] - enumerators: list[symbols.Enumerator] = [] - for ch in chs: - result = self.get_data(ch) - if result is None: - continue - if isinstance(result.value, int): - 
enumerators.append(symbols.Enumerator(result.name, result.value)) - return enumerators - - def get_struct(self, type_id: int) -> symbols.StructureType: - name = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_SYMNAME) - byte_size = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_LENGTH) - chs = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_FINDCHILDREN) - members: list[symbols.StructMember] = [] - for ch in chs or []: - result = self.get_data(ch) - if result is None: - continue - offset = result.value if isinstance(result.value, int) else 0 - members.append(symbols.StructMember(result.name, result.type, offset)) - return symbols.StructureType(name or "", int(byte_size or 0), members) - - def get_union(self, type_id: int) -> symbols.UnionType: - name = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_SYMNAME) - byte_size = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_LENGTH) - chs = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_FINDCHILDREN) - alternatives: list[symbols.StructMember] = [] - for ch in chs or []: - result = self.get_data(ch) - if result is None: - continue - alternatives.append(symbols.StructMember(result.name, result.type, 0)) - return symbols.UnionType(name or "", int(byte_size or 0), alternatives) - - def get_class(self, type_id: int) -> symbols.ClassType: - name = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_SYMNAME) - byte_size = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_LENGTH) - chs = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_FINDCHILDREN) - members: list[symbols.ClassMember] = [] - for ch in chs or []: - result = self.get_data(ch) - if result is None: - continue - offset = result.value if isinstance(result.value, int) else 0 - members.append( - symbols.ClassMember( - result.name, - "", - result.type, - offset, - 0, - result.datakind == DataKind.DataIsStaticMember, - ) - ) - return symbols.ClassType(name or "", 
int(byte_size or 0), members) - - def get_args(self, type_id: int) -> symbols.TypeInfo | None: - tag_val = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_SYMTAG) - if tag_val is None: - return None - tag = SymTagEnum(tag_val) - if tag != SymTagEnum.SymTagFunctionArgType: - return None - tp = self._get_referenced_type_id(type_id) - if tp is None: - return symbols.UnspecifiedType("unknown") - return self.get_full_type_name(tp) - - - def get_function(self, type_id: int) -> symbols.SubroutineType: - name = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_SYMNAME) - chs = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_FINDCHILDREN) - tid = self._get_referenced_type_id(type_id) - ret_type: symbols.TypeInfo = symbols.UnspecifiedType("void") - if tid is not None: - ret_type = self.get_full_type_name(tid) - args: list[symbols.TypeInfo] = [] - for ch in chs or []: - arg = self.get_args(ch) - if arg is not None: - args.append(arg) - return symbols.SubroutineType(name or "", 0, ret_type, args) - - def get_full_type_name(self, type_id: int | None) -> symbols.TypeInfo: - if type_id is None: - return symbols.UnspecifiedType("unknown") - if type_id in self._type_cache: - return self._type_cache[type_id] - if type_id in self._resolving: - return symbols.UnspecifiedType(f"recursive_type_{type_id}") - - self._resolving.add(type_id) - try: - resolved = self._resolve_type(type_id) - self._type_cache[type_id] = resolved - return resolved - finally: - self._resolving.discard(type_id) - - def _resolve_type(self, type_id: int) -> symbols.TypeInfo: - tag_val = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_SYMTAG) - if tag_val is None: - return symbols.UnspecifiedType("unknown") - - try: - tag = SymTagEnum(tag_val) - except ValueError: - return symbols.UnspecifiedType(f"unknown_tag_{tag_val}") - - if tag == SymTagEnum.SymTagBaseType: - bt = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_BASETYPE) - length = 
self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_LENGTH) - if bt is not None: - try: - base_type = BasicType(bt) - except ValueError: - base_type = bt - type_name = PRIMITIVE_TYPEMAP.get(base_type, f"base_{bt}") if isinstance(base_type, BasicType) else f"base_{bt}" - enc = symbols.type_encoding_from_pdb_bt(int(base_type)) - resolved = symbols.PrimitiveType(type_name, enc, int(length or 0)) - return self._wrap_qualifiers(type_id, resolved) - enc = symbols.type_encoding_from_pdb_bt(int(BasicType.btVoid)) - return self._wrap_qualifiers(type_id, symbols.PrimitiveType("void", enc, int(length or 0))) - - elif tag == SymTagEnum.SymTagPointerType: - child_id = self._get_referenced_type_id(type_id) - is_ref = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_IS_REFERENCE) - full_type = self.get_full_type_name(child_id) - if is_ref: - resolved = symbols.ReferenceType(full_type) - else: - resolved = symbols.PointerType(full_type) - return self._wrap_qualifiers(type_id, resolved) - - elif tag == SymTagEnum.SymTagArrayType: - child_id = self._get_referenced_type_id(type_id) - count = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_COUNT) - full_type = self.get_full_type_name(child_id) - byte_size = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_LENGTH) - if not count and byte_size and isinstance(full_type, symbols.PrimitiveType) and full_type.byte_size: - count = int(byte_size // full_type.byte_size) - if count is not None: - resolved = symbols.ArrayType(full_type, [(0, int(count))]) - else: - resolved = symbols.ArrayType(full_type, [(0, 0)]) - if isinstance(full_type, symbols.ArrayType): - full_type.array_spec.insert(0, (0, count)) # coerce array-specifiers. 
- return full_type - else: - return self._wrap_qualifiers(type_id, resolved) - elif tag == SymTagEnum.SymTagEnum: - name = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_SYMNAME) - tp = self._get_referenced_type_id(type_id) - base_type = self.get_full_type_name(tp) - byte_size = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_LENGTH) - enumerators = self.get_enumerators(type_id) - encoding = base_type.encoding if isinstance(base_type, symbols.PrimitiveType) else None - resolved = symbols.EnumerationType(name or "", int(byte_size or 0), encoding, base_type, enumerators) - return self._wrap_qualifiers(type_id, resolved) - elif tag == SymTagEnum.SymTagTypedef: - name = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_SYMNAME) - tp = self._get_referenced_type_id(type_id) - base_type = self.get_full_type_name(tp) - resolved = symbols.TypeDefiniton(name or "", base_type) - return self._wrap_qualifiers(type_id, resolved) - elif tag == SymTagEnum.SymTagUDT: - udt_kind = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_UDTKIND) - if udt_kind == UdtKind.UdtStruct: - resolved = self.get_struct(type_id) - elif udt_kind == UdtKind.UdtUnion: - resolved = self.get_union(type_id) - elif udt_kind == UdtKind.UdtClass: - resolved = self.get_class(type_id) - else: - name = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_SYMNAME) - resolved = symbols.UnspecifiedType(name or "udt") - return self._wrap_qualifiers(type_id, resolved) - elif tag == SymTagEnum.SymTagFunctionType: - return self._wrap_qualifiers(type_id, self.get_function(type_id)) - else: - return symbols.UnspecifiedType(tag.name.lstrip("SymTag")) - - -class PdbSession: - """Manages dbghelp.dll symbol session lifecycle. - - Context manager for dbghelp symbol operations. Handles initialization, - module loading, symbol enumeration, and cleanup. - - The session maintains a dbghelp process handle and configures symbol - search paths. 
Automatically loads modules and enables symbol options. - - Attributes: - hproc: Process handle (from GetCurrentProcess) - _modules: Dictionary of loaded module bases by path - - Usage: - ```python - # Basic session - with PdbSession(symbol_path=[".", "C:\\Symbols"]) as pdb: - # Enumerate symbols - for sym in pdb.enum_symbols(): - print(f"{sym.name}: {sym.Address:#x}") - - # Load specific module - pdb = PdbSession() - try: - base = pdb.load_module("app.exe") - info = pdb.get_module_info() - print(f"Module loaded at {base:#x}, size {info.size_of_image} bytes") - finally: - pdb.close() - ``` - - Symbol Options: - The session automatically enables: - - SYMOPT_DEFERRED_LOADS: Load symbols on demand - - SYMOPT_UNDNAME: Undecorate C++ symbols - - SYMOPT_LOAD_LINES: Load source line information - - Note: - Always use context manager (with statement) or manually call close() - to ensure proper cleanup of dbghelp resources. - """ - - def __init__(self, symbol_path: list[str] | None = None): - """Initialize dbghelp symbol session. - - Args: - symbol_path: Optional list of directories to search for symbols. 
- Supports local paths and symbol servers: - - Local: ["C:\\Symbols", "D:\\Debug"] - - Server: ["SRV*C:\\Cache*https://msdl.microsoft.com/download/symbols"] - If None, uses current directory and _NT_SYMBOL_PATH - - Raises: - OSError: If not on Windows or SymInitialize fails - """ - if not _WINDOWS: - raise OSError("PDB support requires Windows (dbghelp.dll)") - - self.hproc = kernel32.GetCurrentProcess() - if symbol_path: - symbol_path_str = ";".join(symbol_path) - else: - symbol_path_str = None - - if not dbghelp.SymInitialize(self.hproc, symbol_path_str, True): - raise OSError(f"SymInitialize failed, error={last_error()}") - - opts = dbghelp.SymGetOptions() - opts |= SYMOPT_DEFERRED_LOADS | SYMOPT_UNDNAME | SYMOPT_LOAD_LINES - dbghelp.SymSetOptions(opts) - self.type_dumper_cache = {} - - @lru_cache - def type_info(self, base: int, type_index: int) -> symbols.TypeInfo: - if type_index: - if base in self.type_dumper_cache: - type_dumper = self.type_dumper_cache[base] - else: - type_dumper = CTypeInfoDump(self.hproc, base) - self.type_dumper_cache[base] = type_dumper - return type_dumper.get_full_type_name(type_index) - return symbols.UnspecifiedType("unknown") - - def cleanup(self) -> None: - """Cleans up the dbghelp session.""" - if _WINDOWS and hasattr(self, "hproc"): - dbghelp.SymCleanup(self.hproc) - - def __enter__(self): - return self - - def __exit__(self, exc_type, exc_val, exc_tb): - self.cleanup() - - def set_search_path(self, search_path: str) -> None: - """Sets the symbol search path for the current session.""" - if not dbghelp.SymSetSearchPath(self.hproc, search_path.encode("ascii")): - raise OSError(f"SymSetSearchPath failed, error={last_error()}") - - def get_search_path(self) -> str: - """Gets the symbol search path for the current session.""" - buffer = ctypes.create_string_buffer(2048) - if not dbghelp.SymGetSearchPath(self.hproc, buffer, ctypes.sizeof(buffer)): - raise OSError(f"SymGetSearchPath failed, error={last_error()}") - return 
buffer.value.decode("ascii") - - def load_module(self, file_path: str) -> int: - """Loads a module for the current session.""" - file_path = str(file_path) - base = dbghelp.SymLoadModuleExW(self.hproc, None, file_path, None, 0, 0, None, 0) - if base == 0: - raise OSError(f"SymLoadModuleExW failed for {file_path}, error={last_error()}") - return base - - def enum_symbols(self, base: int, pattern: bytes = b"*") -> list[dict]: # Generator[]: - """Enumerates symbols in a loaded module.""" - results: list[dict] = [] - - def _cb(pSymInfo, size, ctx): - sym = pSymInfo.contents - # Extract unused but potentially useful fields for debugging - # name, addr, tag = sym.name, sym.Address, sym.tag - # type_name = self.type_info(base, sym.TypeIndex) - results.append(copy(sym)) - return True - - cb = PSYM_ENUMERATESYMBOLS_CALLBACK(_cb) - if not dbghelp.SymEnumSymbols(self.hproc, base, pattern, cb, None): - raise OSError(f"SymEnumSymbols failed, error={last_error()}") - return results - - def sym_from_addr(self, addr: int): - """Retrieves symbol information for the specified address.""" - displacement = ULONG64(0) - info = SYMBOL_INFO() - info.SizeOfStruct = ctypes.sizeof(SYMBOL_INFO) - info.MaxNameLen = MAX_SYM_NAME - if not dbghelp.SymFromAddr(self.hproc, ULONG64(addr), ctypes.byref(displacement), ctypes.byref(info)): - raise OSError(f"SymFromAddr failed, error={last_error()}") - return info.Name.decode(errors="ignore"), int(info.Address), int(displacement.value) - - def get_module_information(self, hmod: HANDLE) -> ModuleInfo: - """Gets module information for the given module handle.""" - modinfo = MODULEINFO() - if not psapi.GetModuleInformation(self.hproc, hmod, ctypes.byref(modinfo), ctypes.sizeof(modinfo)): - raise OSError(f"GetModuleInformation failed, error={last_error()}") - return ModuleInfo(modinfo.lpBaseOfDll, modinfo.SizeOfImage, modinfo.EntryPoint) - - -def pdb_symbols_for_pe(pe_path: str, symbol_path: str | None = None) -> list[dict]: - """Load PDB symbols for a PE 
file (high-level API). - - Convenience function that creates a PdbSession, loads the PE module, - enumerates all symbols, and returns a list compatible with Pe_Symbol. - - Args: - pe_path: Path to PE file (.exe, .dll, etc.) - symbol_path: Optional symbol search path string. - Supports semicolon-separated paths and symbol servers: - - "C:\\Symbols;D:\\Debug" - - "SRV*C:\\Cache*https://msdl.microsoft.com/download/symbols" - If None, searches current directory - - Returns: - List of symbol dictionaries with fields: - - name (str): Symbol name - - value (int): Symbol address (absolute VA) - - section_number (int): Always 0 for PDB symbols - - type (str): Type information if available - - storage_class (int): Always 0 for PDB symbols - - Example: - ```python - # Basic usage - symbols = pdb_symbols_for_pe("kernel32.dll") - for sym in symbols: - print(f"{sym['name']:40s} @ {sym['value']:#010x}") - - # With symbol server - symbols = pdb_symbols_for_pe( - "app.exe", - "SRV*C:\\SymCache*https://msdl.microsoft.com/download/symbols" - ) - - # Filter functions only - functions = [s for s in symbols if s.get('is_function', False)] - ``` - - Note: - - Returns empty list if PDB not found or on non-Windows platforms - - Symbol addresses are absolute (not RVAs) - - Errors are caught and logged, returning empty list - - For more control, use PdbSession directly - - Integration: - This function is called automatically by objutils.pecoff.PeParser - when COFF symbol table is empty. 
- """ - if not _WINDOWS: - return [] - - try: - with PdbSession(symbol_path if not symbol_path else [symbol_path]) as session: - mod_base = session.load_module(pe_path) - pdb_symbols = session.enum_symbols(mod_base, b"*") - result = [] - for sym in pdb_symbols: - if sym.tag != "SymTagData": - continue - ti = session.type_info(mod_base, sym.TypeIndex) - print(f"{sym.tag:15}", sym.Name, hex(sym.Address - mod_base), sym.Size , sym.decode_flags(), "==>", ti) - result.append(symbols.VariableType(sym.Name, ti, sym.Address - mod_base, sym.Size)) - return result - """ - Name: Symbol name (null-terminated char array) - Address: Absolute address in memory - ModBase: Module base address - Flags: Symbol flags (SymFlag enum values) - Tag: Symbol tag type (SymTagEnum values) - Size: Symbol size in bytes - Value: Symbol value (for constants) - - Helper Methods: - is_function(): True if symbol is a function - is_export(): True if symbol is exported - is_local(): True if symbol is local variable - is_parameter(): True if symbol is function parameter - decode_flags(): List of flag names - """ - - except (OSError, RuntimeError, ValueError) as e: - print(f"Error: {str(e)}") - return [] # Return an empty list in case of errors. - - -def main(pe_path: str): # pragma: no cover - debug helper - items = pdb_symbols_for_pe(pe_path) - for it in items[:50]: - print(f"{it['name']} : {it.get('type', 'unknown')} @ 0x{it['value']:016X}") +#!/usr/bin/env python + +"""PDB debug symbol integration for PE/COFF files (Windows only). + +This module provides access to Microsoft Program Database (PDB) debug information +using the Windows dbghelp.dll API. It enables comprehensive symbol lookup beyond +the typically stripped COFF symbol table in release binaries. 
+ +**Platform Support**: Windows only (requires dbghelp.dll) + +Overview: + PDB files contain rich debug information: + + - **Symbols**: Function names, variables, constants + - **Types**: Structures, unions, enums, typedefs + - **Source Info**: File names, line numbers + - **Call Frames**: Stack unwinding data + + ``` + PE File + PDB: + ┌──────────────┐ ┌──────────────┐ + │ app.exe │────>│ app.pdb │ + │ │ │ │ + │ Code │ │ - Symbols │ + │ Data │ │ - Types │ + │ (stripped) │ │ - Lines │ + └──────────────┘ └──────────────┘ + ``` + +Architecture: + **Windows dbghelp.dll**: + - Microsoft's debug helper library + - Symbol server support + - Handles PDB loading and parsing + - Provides symbol enumeration API + + **Symbol Enumeration**: + 1. Initialize dbghelp session (SymInitialize) + 2. Load PE module (SymLoadModuleExW) + 3. Set symbol search paths + 4. Enumerate symbols (SymEnumSymbolsA with callback) + 5. Extract type information (optional) + 6. Cleanup (SymCleanup) + + **Type Information Extraction**: + - Uses dbghelp type info API (SymGetTypeInfo) + - Recursively resolves pointers, arrays, structs + - Extracts sizes, offsets, field names + +Usage Examples: + **Basic Symbol Extraction**: + ```python + from objutils.pecoff.pdb import pdb_symbols_for_pe + + # Load symbols from PDB + symbols = pdb_symbols_for_pe("app.exe") + + for sym in symbols: + print(f"{sym['name']:40s} @ {sym['address']:#010x}") + ``` + + **With Symbol Search Path**: + ```python + # Search multiple directories for PDB + symbols = pdb_symbols_for_pe( + "app.exe", + symbol_path="C:\\Symbols;SRV*C:\\SymCache*https://msdl.microsoft.com/download/symbols" + ) + ``` + + **Advanced Session Management**: + ```python + from objutils.pecoff.pdb import PdbSession + + with PdbSession("app.exe", symbol_path=[".", "C:\\Symbols"]) as pdb: + # Enumerate all symbols + for sym in pdb.enum_symbols(): + if sym.is_function(): + print(f"Function: {sym.name} @ {sym.Address:#x}") + + # Get module info + info = 
pdb.get_module_info() + print(f"Module base: {info.base_of_dll:#x}") + ``` + + **Type Information Extraction**: + ```python + from objutils.pecoff.pdb import CTypeInfoDump + + # Extract C type definitions + type_dumper = CTypeInfoDump(pdb_session.handle, base_address) + type_info = type_dumper.get_type_from_type_index(type_idx) + print(f"Type: {type_info['type_name']}, Size: {type_info['size']}") + ``` + +Key Components: + **Enums**: + - **SymTagEnum**: Symbol tag types (function, data, UDT, etc.) + - **BasicType**: Primitive types (int, float, void, etc.) + - **SymFlag**: Symbol flags (export, local, function, etc.) + - **IMAGEHLP_SYMBOL_TYPE_INFO**: Type info query constants + + **Data Classes**: + - **ModuleInfo**: Module metadata (base address, size, entry point) + - **SYMBOL_INFO**: Symbol information structure (ctypes) + - **MODULEINFO**: Windows API module info structure + + **Core Classes**: + - **CTypeInfoDump**: Type information extraction and resolution + - **PdbSession**: Manages dbghelp.dll lifetime and operations + +dbghelp.dll API: + The module wraps these key dbghelp functions: + + - **SymInitialize**: Initialize symbol handler + - **SymCleanup**: Cleanup symbol handler + - **SymLoadModuleExW**: Load module for symbol resolution + - **SymEnumSymbolsA**: Enumerate symbols with callback + - **SymGetTypeInfo**: Query type information + - **SymSetSearchPath/SymGetSearchPath**: Symbol path management + +Symbol Search Paths: + dbghelp supports flexible symbol search: + + - **Local paths**: "C:\\Symbols;D:\\Debug" + - **Symbol servers**: "SRV*C:\\Cache*https://msdl.microsoft.com/download/symbols" + - **Combined**: "C:\\Local;SRV*C:\\Cache*https://server" + + The `_NT_SYMBOL_PATH` environment variable is respected. 
+ +Limitations: + - **Windows only**: Requires dbghelp.dll (unavailable on Linux/Mac) + - **PDB required**: Release binaries typically lack embedded COFF symbols + - **Architecture match**: PDB must match PE architecture (x86/x64) + - **Version match**: PDB should match PE build (GUID/age check) + - **Type info**: Complex recursive structures may have limitations + +Error Handling: + On non-Windows platforms, dbghelp/kernel32/psapi are set to None: + + ```python + from objutils.pecoff.pdb import _WINDOWS + + if not _WINDOWS: + print("PDB support unavailable (not Windows)") + ``` + + Import errors are caught and gracefully handled in __init__.py. + +See Also: + - objutils.pecoff: Main PE parser that uses this module + - objutils.pecoff.defs: PE/COFF constants + - objutils.elf.model: Similar ORM pattern for ELF + - Microsoft dbghelp.dll documentation + - PDB format specification + +Example Integration: + ```python + from objutils.pecoff import PeParser + + # PeParser automatically attempts PDB loading + pe = PeParser("kernel32.dll", pdb_path=["C:\\Symbols"]) + + # Symbols now include PDB data if found + for sym in pe.symbols: + print(f"{sym['name']}: {sym['value']:#x}") + ``` +""" + +import ctypes +import enum +from copy import copy +from ctypes import wintypes +from dataclasses import dataclass +from enum import IntEnum +from functools import lru_cache +from typing import Any, Optional + +# DLLs +try: + dbghelp = ctypes.WinDLL("dbghelp") # type: ignore[attr-defined] + kernel32 = ctypes.WinDLL("kernel32") # type: ignore[attr-defined] + psapi = ctypes.WinDLL("psapi") + _WINDOWS = True +except OSError: # pragma: no cover - non-Windows environment + dbghelp = None # type: ignore[assignment] + kernel32 = None # type: ignore[assignment] + psapi = None + _WINDOWS = False + +from objutils import symbols + + +@dataclass +class ModuleInfo: + """Module metadata extracted from Windows process. 
+ + Attributes: + base_of_dll: Base address where module is loaded in memory + size_of_image: Size of module in memory (bytes) + entry_point: Address of module entry point (or None) + + Example: + ```python + info = pdb_session.get_module_info() + print(f"Module: {info.base_of_dll:#x} - {info.base_of_dll + info.size_of_image:#x}") + ``` + """ + + base_of_dll: int + size_of_image: int + entry_point: Optional[int] + + +# Types +HANDLE = wintypes.HANDLE +HLOCAL = wintypes.HANDLE +DWORD = wintypes.DWORD +ULONG = wintypes.ULONG +ULONG64 = ctypes.c_ulonglong +BOOL = wintypes.BOOL +LPVOID = wintypes.LPVOID +LPCWSTR = wintypes.LPCWSTR +LPCSTR = wintypes.LPCSTR + + +class VARTYPE(IntEnum): + """OLE Automation variant type discriminator (vt field of VARIANT).""" + + VT_EMPTY = 0 + VT_NULL = 1 + VT_I2 = 2 + VT_I4 = 3 + VT_R4 = 4 + VT_R8 = 5 + VT_BSTR = 8 + VT_BOOL = 11 + VT_I1 = 16 + VT_UI1 = 17 + VT_UI2 = 18 + VT_UI4 = 19 + VT_I8 = 20 + VT_UI8 = 21 + VT_INT = 22 + VT_UINT = 23 + + +class _VARIANT_VALUE(ctypes.Union): + """Inner value union of a COM VARIANT (covers numeric and pointer cases).""" + + _fields_ = [ + ("llVal", ctypes.c_longlong), + ("lVal", ctypes.c_long), + ("bVal", ctypes.c_ubyte), + ("iVal", ctypes.c_short), + ("fltVal", ctypes.c_float), + ("dblVal", ctypes.c_double), + ("boolVal", ctypes.c_short), + ("scode", ctypes.c_long), + ("cVal", ctypes.c_int8), + ("uiVal", ctypes.c_ushort), + ("ulVal", ctypes.c_ulong), + ("ullVal", ctypes.c_ulonglong), + ("intVal", ctypes.c_int), + ("uintVal", ctypes.c_uint), + ("byref", ctypes.c_void_p), + ] + + +class VARIANT(ctypes.Structure): + """Minimal ctypes representation of the OLE Automation VARIANT structure. + + The full COM VARIANT is a discriminated union keyed on the ``vt`` field. + Only the scalar numeric types that can appear as PDB constant values are + covered here; pointer/array/record sub-types are not needed. + + Total size is 16 bytes (matching the Windows ABI definition). 
+ """ + + _fields_ = [ + ("vt", ctypes.c_ushort), + ("wReserved1", ctypes.c_ushort), + ("wReserved2", ctypes.c_ushort), + ("wReserved3", ctypes.c_ushort), + ("_value", _VARIANT_VALUE), + ] + + +def _variant_to_python(variant): + """Convert a VARIANT value to an appropriate Python primitive. + + Only the numeric VARTYPE values that are relevant for PDB constant symbols + are handled. Unknown or unsupported types return ``None``. + """ + try: + kind = VARTYPE(variant.vt) + except ValueError: + return None + v = variant._value + _map = { + VARTYPE.VT_I1: lambda: int(v.cVal), + VARTYPE.VT_I2: lambda: int(v.iVal), + VARTYPE.VT_I4: lambda: int(v.lVal), + VARTYPE.VT_I8: lambda: int(v.llVal), + VARTYPE.VT_UI1: lambda: int(v.bVal), + VARTYPE.VT_UI2: lambda: int(v.uiVal), + VARTYPE.VT_UI4: lambda: int(v.ulVal), + VARTYPE.VT_UI8: lambda: int(v.ullVal), + VARTYPE.VT_INT: lambda: int(v.intVal), + VARTYPE.VT_UINT: lambda: int(v.uintVal), + VARTYPE.VT_R4: lambda: float(v.fltVal), + VARTYPE.VT_R8: lambda: float(v.dblVal), + VARTYPE.VT_BOOL: lambda: bool(v.boolVal), + } + fn = _map.get(kind) + return fn() if fn is not None else None + + +class TI_FINDCHILDREN_PARAMS(ctypes.Structure): + _fields_ = [ + ("Count", ULONG), + ("Start", ULONG), + ("ChildId", ULONG * 1), + ] + + +# SYMBOL_INFO struct (ANSI) +MAX_SYM_NAME = 2000 + + +class SYMBOL_INFO(ctypes.Structure): + """Windows API structure for symbol information. + + Used with dbghelp.dll SymEnumSymbolsA to enumerate symbols. + Contains detailed information about a symbol including name, address, + flags, and type information. 
+ + Key Attributes: + Name: Symbol name (null-terminated char array) + Address: Absolute address in memory + ModBase: Module base address + Flags: Symbol flags (SymFlag enum values) + Tag: Symbol tag type (SymTagEnum values) + Size: Symbol size in bytes + Value: Symbol value (for constants) + + Helper Methods: + is_function(): True if symbol is a function + is_export(): True if symbol is exported + is_local(): True if symbol is local variable + is_parameter(): True if symbol is function parameter + decode_flags(): List of flag names + + Properties: + name: Decoded symbol name (str) + tag: Symbol tag name (str) + rel_address: Relative address (Address - ModBase) + + Example: + ```python + # Used in enumeration callback + def callback(sym_info, size, context): + sym = ctypes.cast(sym_info, ctypes.POINTER(SYMBOL_INFO)).contents + if sym.is_function(): + print(f"Function: {sym.name} @ {sym.Address:#x}") + return True # Continue enumeration + ``` + """ + + def is_clr_token(self) -> bool: + """Check if symbol is a CLR token (.NET managed code).""" + return bool(self.Flags & SymFlag.SYMFLAG_CLR_TOKEN) + return bool(self.Flags & SymFlag.SYMFLAG_CLR_TOKEN) + + def is_constant(self) -> bool: + """Check if symbol is a constant value.""" + return bool(self.Flags & SymFlag.SYMFLAG_CONSTANT) + + def is_export(self) -> bool: + """Check if symbol is exported from module.""" + return bool(self.Flags & SymFlag.SYMFLAG_EXPORT) + + def is_forwarder(self) -> bool: + """Check if symbol is an export forwarder.""" + return bool(self.Flags & SymFlag.SYMFLAG_FORWARDER) + + def is_framerel(self) -> bool: + """Check if symbol is frame-relative (stack variable).""" + return bool(self.Flags & SymFlag.SYMFLAG_FRAMEREL) + + def is_function(self) -> bool: + """Check if symbol is a function.""" + return bool(self.Flags & SymFlag.SYMFLAG_FUNCTION) + + def is_ilrel(self) -> bool: + """Check if symbol is IL-relative (.NET managed code).""" + return bool(self.Flags & SymFlag.SYMFLAG_ILREL) + + def 
is_local(self) -> bool: + """Check if symbol is a local variable.""" + return bool(self.Flags & SymFlag.SYMFLAG_LOCAL) + + def is_metadata(self) -> bool: + """Check if symbol is metadata.""" + return bool(self.Flags & SymFlag.SYMFLAG_METADATA) + + def is_parameter(self) -> bool: + """Check if symbol is a function parameter.""" + return bool(self.Flags & SymFlag.SYMFLAG_PARAMETER) + + def is_register(self) -> bool: + """Check if symbol is in a register.""" + return bool(self.Flags & SymFlag.SYMFLAG_REGISTER) + + def is_regrel(self) -> bool: + """Check if symbol is register-relative.""" + return bool(self.Flags & SymFlag.SYMFLAG_REGREL) + + def is_slot(self) -> bool: + """Check if symbol is a slot (.NET managed code).""" + return bool(self.Flags & SymFlag.SYMFLAG_SLOT) + + def is_thunk(self) -> bool: + """Check if symbol is a thunk (jump stub).""" + return bool(self.Flags & SymFlag.SYMFLAG_THUNK) + + def is_tlsrel(self) -> bool: + """Check if symbol is thread-local storage relative.""" + return bool(self.Flags & SymFlag.SYMFLAG_TLSREL) + + def is_value_present(self) -> bool: + """Check if symbol has value field populated.""" + return bool(self.Flags & SymFlag.SYMFLAG_VALUEPRESENT) + + def is_virtual(self) -> bool: + """Check if symbol is virtual.""" + return bool(self.Flags & SymFlag.SYMFLAG_VIRTUAL) + + # @cached_property + def decode_flags(self) -> list[str]: + """Decode Flags field to list of flag names. + + Returns: + List of flag names (e.g., ["SYMFLAG_FUNCTION", "SYMFLAG_EXPORT"]) + """ + return [f.name for f in SymFlag if self.Flags & f.value] + + # @cached_property + @property + def name(self): + """Get symbol name as decoded string. + + Returns: + Symbol name (str), ignoring decode errors + """ + return self.Name.decode(errors="ignore") + + # @cached_property + @property + def tag(self): + """Get symbol tag name. 
class SymTagEnum(IntEnum):
    """Symbol tag types for PDB symbols.

    Defines the kind of symbol (function, data, type, etc.).
    Used in SYMBOL_INFO.Tag field.

    The values mirror the DIA SDK ``SymTagEnum`` enumeration, which is a plain
    sequential C enum starting at 0. Every member is therefore exactly one
    greater than its predecessor; there are no gaps.

    Common Values:
        SymTagFunction (5): Function symbol
        SymTagData (7): Variable symbol
        SymTagPublicSymbol (10): Exported symbol
        SymTagUDT (11): User-defined type (struct/class)
        SymTagEnum (12): Enumeration type
        SymTagPointerType (14): Pointer type
        SymTagArrayType (15): Array type
        SymTagBaseType (16): Primitive type
    """

    SymTagNull = 0
    SymTagExe = 1
    SymTagCompiland = 2
    SymTagCompilandDetails = 3
    SymTagCompilandEnv = 4
    SymTagFunction = 5
    SymTagBlock = 6
    SymTagData = 7
    SymTagAnnotation = 8
    SymTagLabel = 9
    SymTagPublicSymbol = 10
    SymTagUDT = 11
    SymTagEnum = 12
    SymTagFunctionType = 13
    SymTagPointerType = 14
    SymTagArrayType = 15
    SymTagBaseType = 16
    SymTagTypedef = 17
    SymTagBaseClass = 18
    SymTagFriend = 19
    SymTagFunctionArgType = 20
    SymTagFuncDebugStart = 21
    SymTagFuncDebugEnd = 22
    SymTagUsingNamespace = 23
    SymTagVTableShape = 24
    SymTagVTable = 25
    SymTagCustom = 26
    SymTagThunk = 27
    # These three were previously numbered 29/30/31, which left 28 unmapped
    # and shifted every later tag by one.
    SymTagCustomType = 28
    SymTagManagedType = 29
    SymTagDimension = 30
    # Remaining tags of the enumeration, so that newer PDBs map to a name
    # instead of failing the IntEnum lookup.
    SymTagCallSite = 31
    SymTagInlineSite = 32
    SymTagBaseInterface = 33
    SymTagVectorType = 34
    SymTagMatrixType = 35
    SymTagHLSLType = 36
    SymTagCaller = 37
    SymTagCallee = 38
    SymTagExport = 39
    SymTagHeapAllocationSite = 40
    SymTagCoffGroup = 41
    SymTagInline = 42
+ + Common Queries: + TI_GET_SYMTAG (0): Get symbol tag + TI_GET_SYMNAME (1): Get symbol name + TI_GET_LENGTH (2): Get type size in bytes + TI_GET_TYPE (3): Get type index + TI_GET_BASETYPE (5): Get base type (BasicType enum) + TI_GET_CHILDRENCOUNT (13): Get count of child members + TI_GET_OFFSET (10): Get member offset in struct + """ + + TI_GET_SYMTAG = 0 + TI_GET_SYMNAME = 1 + TI_GET_LENGTH = 2 + TI_GET_TYPE = 3 + TI_GET_TYPEID = 4 + TI_GET_BASETYPE = 5 + TI_GET_ARRAYINDEXTYPEID = 6 + TI_FINDCHILDREN = 7 + TI_GET_DATAKIND = 8 + TI_GET_ADDRESSOFFSET = 9 + TI_GET_OFFSET = 10 + TI_GET_VALUE = 11 + TI_GET_COUNT = 12 + TI_GET_CHILDRENCOUNT = 13 + TI_GET_BITPOSITION = 14 + TI_GET_VIRTUALBASECLASS = 15 + TI_GET_VIRTUALTABLESHAPEID = 16 + TI_GET_VIRTUALBASEPOINTEROFFSET = 17 + TI_GET_CLASSTYPEID = 18 + TI_GET_NESTED = 19 + TI_GET_SYMINDEX = 20 + TI_GET_LEXICALPARENT = 21 + TI_GET_ADDRESS = 22 + TI_GET_THISADJUST = 23 + TI_GET_UDTKIND = 24 + TI_IS_EQUIV_TO = 25 + TI_GET_CALLING_CONVENTION = 26 + TI_IS_CLOSE_EQUIV_TO = 27 + TI_GTIEX_REQS_VALID = 28 + TI_GET_VIRTUALBASEOFFSET = 29 + TI_GET_VIRTUALBASEDISPINDEX = 30 + TI_GET_IS_REFERENCE = 31 + TI_GET_INDIRECTVIRTUALBASEDISPINDEX = 32 + TI_GET_VIRTUALBASETABLETYPEID = 33 + TI_GET_OBJECTPOINTERTYPEID = 34 + TI_GET_IS_CONST = 35 + TI_GET_IS_VOLATILE = 36 + TI_GET_IS_UNALIGNED = 37 + + +class BasicType(IntEnum): + """Primitive type identifiers for PDB types. + + Used with TI_GET_BASETYPE query to identify base types. 
class DataKind(enum.IntEnum):
    """Data-kind codes reported for data symbols.

    ``CTypeInfoDump.get_data`` converts the result of a ``TI_GET_DATAKIND``
    query into this enum (falling back to ``DataIsUnknown`` when the value
    cannot be converted) and uses it to pick the follow-up query:

    - ``DataIsConstant``: the value is read with ``TI_GET_VALUE``.
    - ``DataIsGlobal``, ``DataIsStaticLocal``, ``DataIsFileStatic``,
      ``DataIsStaticMember``: the address is read with ``TI_GET_ADDRESS``.
    - ``DataIsLocal``, ``DataIsParam``, ``DataIsObjectPtr``, ``DataIsMember``:
      the offset is read with ``TI_GET_OFFSET``.
    """

    DataIsUnknown = 0
    DataIsLocal = 1
    DataIsStaticLocal = 2
    DataIsParam = 3
    DataIsObjectPtr = 4
    DataIsFileStatic = 5
    DataIsGlobal = 6
    DataIsMember = 7
    DataIsStaticMember = 8
    DataIsConstant = 9
+ + Common Flags: + SYMFLAG_FUNCTION (0x800): Symbol is a function + SYMFLAG_EXPORT (0x200): Symbol is exported + SYMFLAG_LOCAL (0x80): Symbol is local variable + SYMFLAG_PARAMETER (0x40): Symbol is function parameter + SYMFLAG_REGISTER (0x8): Symbol is in register + SYMFLAG_CONSTANT (0x100): Symbol is a constant + SYMFLAG_VALUEPRESENT (0x1): Value field is valid + """ + + SYMFLAG_VALUEPRESENT = 0x00000001 + SYMFLAG_REGISTER = 0x00000008 + SYMFLAG_REGREL = 0x00000010 + SYMFLAG_FRAMEREL = 0x00000020 + SYMFLAG_PARAMETER = 0x00000040 + SYMFLAG_LOCAL = 0x00000080 + SYMFLAG_CONSTANT = 0x00000100 + SYMFLAG_EXPORT = 0x00000200 + SYMFLAG_FORWARDER = 0x00000400 + SYMFLAG_FUNCTION = 0x00000800 + SYMFLAG_VIRTUAL = 0x00001000 + SYMFLAG_THUNK = 0x00002000 + SYMFLAG_TLSREL = 0x00004000 + SYMFLAG_SLOT = 0x00008000 + SYMFLAG_ILREL = 0x00010000 + SYMFLAG_METADATA = 0x00020000 + SYMFLAG_CLR_TOKEN = 0x00040000 + + +# Prototypes +if _WINDOWS: + dbghelp.SymInitialize.argtypes = [HANDLE, LPCWSTR, BOOL] + dbghelp.SymInitialize.restype = BOOL + + dbghelp.SymCleanup.argtypes = [HANDLE] + dbghelp.SymCleanup.restype = BOOL + + dbghelp.SymSetOptions.argtypes = [DWORD] + dbghelp.SymSetOptions.restype = DWORD + + dbghelp.SymGetOptions.argtypes = [] + dbghelp.SymGetOptions.restype = DWORD + + dbghelp.SymLoadModuleExW.argtypes = [HANDLE, HANDLE, LPCWSTR, LPCWSTR, ULONG64, DWORD, LPVOID, DWORD] + dbghelp.SymLoadModuleExW.restype = ULONG64 # returns base + + dbghelp.SymSetSearchPath.argtypes = [HANDLE, LPCSTR] + dbghelp.SymSetSearchPath.restype = BOOL + + dbghelp.SymGetSearchPath.argtypes = [HANDLE, ctypes.c_char_p, DWORD] + dbghelp.SymGetSearchPath.restype = BOOL + +# SymEnumSymbolsA callback and function +if _WINDOWS: + PSYM_ENUMERATESYMBOLS_CALLBACK = ctypes.WINFUNCTYPE( + BOOL, + ctypes.POINTER(SYMBOL_INFO), + ULONG, + LPVOID, + ) + +if _WINDOWS: + dbghelp.SymEnumSymbols.argtypes = [HANDLE, ULONG64, LPCSTR, PSYM_ENUMERATESYMBOLS_CALLBACK, LPVOID] + dbghelp.SymEnumSymbols.restype = BOOL + +# 
SymFromAddr +if _WINDOWS: + dbghelp.SymFromAddr.argtypes = [HANDLE, ULONG64, ctypes.POINTER(ULONG64), ctypes.POINTER(SYMBOL_INFO)] + dbghelp.SymFromAddr.restype = BOOL + + dbghelp.SymGetTypeInfo.argtypes = [HANDLE, ULONG64, ULONG, ctypes.c_int, LPVOID] + dbghelp.SymGetTypeInfo.restype = BOOL + +# Kernel32 helpers +if _WINDOWS: + kernel32.GetCurrentProcess.restype = HANDLE + psapi.GetModuleInformation.argtypes = [HANDLE, HANDLE, ctypes.POINTER(MODULEINFO), DWORD] + psapi.GetModuleInformation.restype = BOOL + kernel32.GetLastError.restype = DWORD + kernel32.LoadLibraryA.argtypes = [LPCSTR] + kernel32.LoadLibraryA.restype = HANDLE + kernel32.FreeLibrary.argtypes = [HANDLE] + kernel32.FreeLibrary.restype = BOOL + kernel32.LocalFree.argtypes = [HLOCAL] + kernel32.LocalFree.restype = HLOCAL + + +def last_error(): + if not _WINDOWS: + return 0 + return kernel32.GetLastError() + + +# SYMOPT flags (subset) +SYMOPT_DEFERRED_LOADS = 0x00000004 +SYMOPT_UNDNAME = 0x00000002 +SYMOPT_LOAD_LINES = 0x00000010 + + +def load_library(lib_path: str) -> HANDLE: + """Loads the specified module into the address space of the calling process.""" + if not _WINDOWS: + raise OSError("PDB support requires Windows (kernel32.dll)") + handle = kernel32.LoadLibraryA(lib_path.encode("ascii")) + if not handle: + raise OSError(f"LoadLibraryA failed for {lib_path}, error={last_error()}") + return handle + + +def free_library(hmod: HANDLE) -> None: + """Frees the loaded dynamic-link library (DLL) module.""" + if not _WINDOWS: + raise OSError("PDB support requires Windows (kernel32.dll)") + if not kernel32.FreeLibrary(hmod): + raise OSError(f"FreeLibrary failed, error={last_error()}") + + +class CTypeInfoDump: + """Extracts C type information from PDB debug symbols. + + Recursively resolves type definitions including pointers, arrays, + structures, unions, and base types. Uses dbghelp.SymGetTypeInfo + to query type metadata. 
+ + Attributes: + process: dbghelp process handle + mod_base: Module base address + + Type Resolution Algorithm: + 1. Query type tag (pointer, array, UDT, base type, etc.) + 2. For compound types: + - Pointer: Resolve pointed-to type + - Array: Resolve element type and count + - UDT: Enumerate members recursively + 3. Calculate sizes and offsets + 4. Build type dictionary with metadata + + Example: + ```python + type_dumper = CTypeInfoDump(pdb_session.handle, base_address) + + # Get type info for a symbol + type_info = type_dumper.get_type_from_type_index(type_idx) + print(f"Type: {type_info['type_name']}") + print(f"Size: {type_info['size']} bytes") + + # For struct, enumerate members + if 'members' in type_info: + for member in type_info['members']: + print(f" {member['name']}: {member['type_name']} @ +{member['offset']}") + ``` + + Note: + Type resolution can be slow for complex recursive structures. + Use caching when querying multiple symbols. + """ + + def __init__(self, process, mod_base): + """Initialize type info dumper. + + Args: + process: dbghelp process handle from PdbSession + mod_base: Module base address + """ + self.process = process + self.mod_base = mod_base + self._type_cache: dict[int, symbols.TypeInfo] = {} + self._resolving: set[int] = set() + + def get_type_info(self, type_id, info_type): + """Query type information from dbghelp. + + Args: + type_id: Type index to query + info_type: IMAGEHLP_SYMBOL_TYPE_INFO constant + + Returns: + Type information value (type depends on info_type): + - String for TI_GET_SYMNAME + - Integer for TI_GET_LENGTH, TI_GET_COUNT, etc. + - Boolean for TI_GET_IS_CONST, TI_GET_IS_VOLATILE, etc. + - None if query fails + + Note: + Different info_type values return different data types. + Memory for strings (TI_GET_SYMNAME) is automatically freed. 
+ """ + if not _WINDOWS: + return None + if info_type in (IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_SYMNAME,): + ptr = ctypes.c_void_p() + if dbghelp.SymGetTypeInfo(self.process, self.mod_base, type_id, info_type.value, ctypes.byref(ptr)): + if ptr.value: + name = ctypes.wstring_at(ptr) + kernel32.LocalFree(ptr) + return name + elif info_type in ( + IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_SYMTAG, + IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_BASETYPE, + IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_TYPE, + IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_TYPEID, + IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_ARRAYINDEXTYPEID, + IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_COUNT, + IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_CHILDRENCOUNT, + IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_OFFSET, + IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_DATAKIND, + IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_UDTKIND, + IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_ADDRESSOFFSET, + IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_BITPOSITION, + IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_VIRTUALTABLESHAPEID, + IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_VIRTUALBASEPOINTEROFFSET, + IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_CLASSTYPEID, + IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_NESTED, + IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_SYMINDEX, + IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_LEXICALPARENT, + IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_THISADJUST, + IMAGEHLP_SYMBOL_TYPE_INFO.TI_IS_EQUIV_TO, + IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_CALLING_CONVENTION, + IMAGEHLP_SYMBOL_TYPE_INFO.TI_IS_CLOSE_EQUIV_TO, + IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_VIRTUALBASEOFFSET, + IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_VIRTUALBASEDISPINDEX, + IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_OBJECTPOINTERTYPEID, + ): + out = DWORD() + if dbghelp.SymGetTypeInfo(self.process, self.mod_base, type_id, info_type.value, ctypes.byref(out)): + return out.value + elif info_type == IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_VALUE: + out = VARIANT() + if dbghelp.SymGetTypeInfo(self.process, self.mod_base, type_id, info_type.value, ctypes.byref(out)): + return _variant_to_python(out) + elif info_type in ( + IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_LENGTH, + 
def _get_referenced_type_id(self, type_id: int) -> int | None:
    """Return the type index that ``type_id`` refers to.

    Tries ``TI_GET_TYPEID`` first and falls back to ``TI_GET_TYPE`` when the
    first query yields ``None``.

    Args:
        type_id: Type index whose referenced type is wanted.

    Returns:
        The referenced type index, or ``None`` when both queries fail.
    """
    for query in (
        IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_TYPEID,
        IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_TYPE,
    ):
        referenced = self.get_type_info(type_id, query)
        if referenced is not None:
            return referenced
    return None
def get_data(self, type_id: int) -> symbols.DataType | None:
    """Describe a data symbol (variable, member, constant, ...).

    Args:
        type_id: Symbol/type index to inspect.

    Returns:
        A ``symbols.DataType`` holding the symbol name, a kind-dependent value,
        the resolved type and the ``DataKind``; ``None`` when the tag cannot be
        queried or the symbol is not tagged ``SymTagData``.

        The ``value`` field depends on the data kind:
        - constant: the value from ``TI_GET_VALUE``
        - global / static local / file static / static member: ``TI_GET_ADDRESS``
        - local / parameter / object pointer / member: ``TI_GET_OFFSET``
        - anything else: ``None``
    """
    tag_val = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_SYMTAG)
    # Compare the raw integer instead of constructing SymTagEnum(tag_val):
    # a tag value missing from the enum used to raise ValueError here and
    # abort the caller's whole member enumeration. IntEnum members compare
    # equal to plain ints, so the check itself is unchanged for known tags.
    if tag_val is None or tag_val != SymTagEnum.SymTagData:
        return None
    name = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_SYMNAME)
    tp = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_TYPE)
    base_type = self.get_full_type_name(tp) if tp is not None else symbols.UnspecifiedType("unknown")
    data_kind_value = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_DATAKIND)
    try:
        data_kind = DataKind(data_kind_value)
    except (TypeError, ValueError):
        # None (query failed) or an unmapped value.
        data_kind = DataKind.DataIsUnknown

    value: Any = None
    if data_kind == DataKind.DataIsConstant:
        value = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_VALUE)
    elif data_kind in (
        DataKind.DataIsGlobal,
        DataKind.DataIsStaticLocal,
        DataKind.DataIsFileStatic,
        DataKind.DataIsStaticMember,
    ):
        value = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_ADDRESS)
    elif data_kind in (DataKind.DataIsLocal, DataKind.DataIsParam, DataKind.DataIsObjectPtr, DataKind.DataIsMember):
        value = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_OFFSET)
    return symbols.DataType(name or "", value, base_type, data_kind)
def get_struct(self, type_id: int) -> symbols.StructureType:
    """Build a ``symbols.StructureType`` for the UDT at ``type_id``.

    Children that ``get_data`` does not recognise as data symbols are skipped.
    A member whose value is not an ``int`` is recorded at offset 0.

    Args:
        type_id: Type index of the structure.

    Returns:
        Structure description with name ("" if unavailable), byte size
        (0 if unavailable) and the collected members.
    """
    struct_name = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_SYMNAME)
    total_bytes = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_LENGTH)
    child_ids = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_FINDCHILDREN) or []

    fields: list[symbols.StructMember] = []
    for data in map(self.get_data, child_ids):
        if data is not None:
            member_offset = data.value if isinstance(data.value, int) else 0
            fields.append(symbols.StructMember(data.name, data.type, member_offset))

    return symbols.StructureType(struct_name or "", int(total_bytes or 0), fields)
def _resolve_type(self, type_id: int) -> symbols.TypeInfo:
    """Build the ``symbols`` description for one type index (no caching).

    Dispatches on the symbol tag of ``type_id``: base types, pointers and
    references, arrays, enums, typedefs, UDTs (struct/union/class) and
    function types each get their own ``symbols`` type; const/volatile
    qualifiers are applied through ``_wrap_qualifiers``. Nested types are
    resolved through ``get_full_type_name``.

    Args:
        type_id: Type index to resolve.

    Returns:
        The resolved type. ``UnspecifiedType`` is returned when the tag cannot
        be queried ("unknown"), is not a ``SymTagEnum`` member
        ("unknown_tag_<n>"), or has no dedicated handling (the tag name
        without its "SymTag" prefix).
    """
    tag_val = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_SYMTAG)
    if tag_val is None:
        return symbols.UnspecifiedType("unknown")

    try:
        tag = SymTagEnum(tag_val)
    except ValueError:
        return symbols.UnspecifiedType(f"unknown_tag_{tag_val}")

    if tag == SymTagEnum.SymTagBaseType:
        bt = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_BASETYPE)
        length = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_LENGTH)
        if bt is not None:
            try:
                base_type = BasicType(bt)
            except ValueError:
                # Unmapped basic type: keep the raw number.
                base_type = bt
            type_name = PRIMITIVE_TYPEMAP.get(base_type, f"base_{bt}") if isinstance(base_type, BasicType) else f"base_{bt}"
            enc = symbols.type_encoding_from_pdb_bt(int(base_type))
            resolved = symbols.PrimitiveType(type_name, enc, int(length or 0))
            return self._wrap_qualifiers(type_id, resolved)
        enc = symbols.type_encoding_from_pdb_bt(int(BasicType.btVoid))
        return self._wrap_qualifiers(type_id, symbols.PrimitiveType("void", enc, int(length or 0)))

    elif tag == SymTagEnum.SymTagPointerType:
        child_id = self._get_referenced_type_id(type_id)
        is_ref = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_IS_REFERENCE)
        full_type = self.get_full_type_name(child_id)
        if is_ref:
            resolved = symbols.ReferenceType(full_type)
        else:
            resolved = symbols.PointerType(full_type)
        return self._wrap_qualifiers(type_id, resolved)

    elif tag == SymTagEnum.SymTagArrayType:
        child_id = self._get_referenced_type_id(type_id)
        count = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_COUNT)
        full_type = self.get_full_type_name(child_id)
        byte_size = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_LENGTH)
        # No element count reported: derive it from total size / element size.
        if not count and byte_size and isinstance(full_type, symbols.PrimitiveType) and full_type.byte_size:
            count = int(byte_size // full_type.byte_size)
        if count is not None:
            resolved = symbols.ArrayType(full_type, [(0, int(count))])
        else:
            resolved = symbols.ArrayType(full_type, [(0, 0)])
        if isinstance(full_type, symbols.ArrayType):
            # NOTE(review): this mutates the element array type in place, and
            # that object may be the instance held in the type cache, so other
            # users of the inner array type could see the extra dimension.
            # Confirm whether a copy is required.
            full_type.array_spec.insert(0, (0, count))  # coerce array-specifiers.
            return full_type
        else:
            return self._wrap_qualifiers(type_id, resolved)
    elif tag == SymTagEnum.SymTagEnum:
        name = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_SYMNAME)
        tp = self._get_referenced_type_id(type_id)
        base_type = self.get_full_type_name(tp)
        byte_size = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_LENGTH)
        enumerators = self.get_enumerators(type_id)
        encoding = base_type.encoding if isinstance(base_type, symbols.PrimitiveType) else None
        resolved = symbols.EnumerationType(name or "", int(byte_size or 0), encoding, base_type, enumerators)
        return self._wrap_qualifiers(type_id, resolved)
    elif tag == SymTagEnum.SymTagTypedef:
        name = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_SYMNAME)
        tp = self._get_referenced_type_id(type_id)
        base_type = self.get_full_type_name(tp)
        resolved = symbols.TypeDefiniton(name or "", base_type)
        return self._wrap_qualifiers(type_id, resolved)
    elif tag == SymTagEnum.SymTagUDT:
        udt_kind = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_UDTKIND)
        if udt_kind == UdtKind.UdtStruct:
            resolved = self.get_struct(type_id)
        elif udt_kind == UdtKind.UdtUnion:
            resolved = self.get_union(type_id)
        elif udt_kind == UdtKind.UdtClass:
            resolved = self.get_class(type_id)
        else:
            name = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_SYMNAME)
            resolved = symbols.UnspecifiedType(name or "udt")
        return self._wrap_qualifiers(type_id, resolved)
    elif tag == SymTagEnum.SymTagFunctionType:
        return self._wrap_qualifiers(type_id, self.get_function(type_id))
    else:
        # removeprefix drops exactly the "SymTag" prefix. The previous
        # lstrip("SymTag") stripped any leading run of the characters
        # S/y/m/T/a/g, turning e.g. "SymTagThunk" into "hunk".
        return symbols.UnspecifiedType(tag.name.removeprefix("SymTag"))
Automatically loads modules and enables symbol options. + + Attributes: + hproc: Process handle (from GetCurrentProcess) + _modules: Dictionary of loaded module bases by path + + Usage: + ```python + # Basic session + with PdbSession(symbol_path=[".", "C:\\Symbols"]) as pdb: + # Enumerate symbols + for sym in pdb.enum_symbols(): + print(f"{sym.name}: {sym.Address:#x}") + + # Load specific module + pdb = PdbSession() + try: + base = pdb.load_module("app.exe") + info = pdb.get_module_info() + print(f"Module loaded at {base:#x}, size {info.size_of_image} bytes") + finally: + pdb.close() + ``` + + Symbol Options: + The session automatically enables: + - SYMOPT_DEFERRED_LOADS: Load symbols on demand + - SYMOPT_UNDNAME: Undecorate C++ symbols + - SYMOPT_LOAD_LINES: Load source line information + + Note: + Always use context manager (with statement) or manually call close() + to ensure proper cleanup of dbghelp resources. + """ + + def __init__(self, symbol_path: list[str] | None = None): + """Initialize dbghelp symbol session. + + Args: + symbol_path: Optional list of directories to search for symbols. 
def type_info(self, base: int, type_index: int) -> symbols.TypeInfo:
    """Resolve a type index of a loaded module to a ``symbols`` type.

    One ``CTypeInfoDump`` is kept per module base in ``self.type_dumper_cache``
    and reused across calls.

    The former ``@lru_cache`` decorator was dropped: on an instance method it
    keys on ``self`` and stores the session in a class-level cache, which
    keeps the session object alive after use.

    Args:
        base: Module base address as returned by ``load_module``.
        type_index: Type index (e.g. ``SYMBOL_INFO.TypeIndex``).

    Returns:
        The resolved type, or ``UnspecifiedType("unknown")`` when
        ``type_index`` is 0/None.
    """
    if not type_index:
        return symbols.UnspecifiedType("unknown")
    type_dumper = self.type_dumper_cache.get(base)
    if type_dumper is None:
        type_dumper = CTypeInfoDump(self.hproc, base)
        self.type_dumper_cache[base] = type_dumper
    return type_dumper.get_full_type_name(type_index)
def pdb_symbols_for_pe(pe_path: str, symbol_path: str | None = None) -> list[symbols.VariableType]:
    """Load PDB data symbols for a PE file (high-level API).

    Convenience function that creates a PdbSession, loads the PE module,
    enumerates its symbols and returns the data symbols together with their
    resolved types.

    Args:
        pe_path: Path to PE file (.exe, .dll, etc.)
        symbol_path: Optional symbol search path string, passed to
            ``PdbSession`` as a single-element list. An empty string or
            ``None`` selects the session's default search path.

    Returns:
        One ``symbols.VariableType`` per enumerated symbol whose tag is
        ``SymTagData``, built from the symbol's ``Name``, its resolved type,
        its address relative to the module base (``Address - mod_base``) and
        its ``Size``. Symbols with any other tag (functions, public symbols,
        ...) are not included.

        An empty list is returned on non-Windows platforms and when an
        ``OSError``, ``RuntimeError`` or ``ValueError`` occurs; in the latter
        case the error message is printed.

    Note:
        For more control, use PdbSession directly.
    """
    if not _WINDOWS:
        return []

    # PdbSession takes a list of search directories, or None for the default.
    search_path = [symbol_path] if symbol_path else None
    try:
        with PdbSession(search_path) as session:
            mod_base = session.load_module(pe_path)
            result = []
            for sym in session.enum_symbols(mod_base, b"*"):
                if sym.tag != "SymTagData":
                    continue
                ti = session.type_info(mod_base, sym.TypeIndex)
                # NOTE(review): sym.Name is the raw ctypes char-array value;
                # confirm whether VariableType expects the decoded sym.name.
                result.append(symbols.VariableType(sym.Name, ti, sym.Address - mod_base, sym.Size))
            return result
    except (OSError, RuntimeError, ValueError) as e:
        print(f"Error: {str(e)}")
        return []  # Return an empty list in case of errors.
+ + +def main(pe_path: str): # pragma: no cover - debug helper + items = pdb_symbols_for_pe(pe_path) + for it in items[:50]: + print(f"{it['name']} : {it.get('type', 'unknown')} @ 0x{it['value']:016X}") diff --git a/objutils/scripts/oj_coff_syms.py b/objutils/scripts/oj_coff_syms.py index d79bb2e..c289ff7 100644 --- a/objutils/scripts/oj_coff_syms.py +++ b/objutils/scripts/oj_coff_syms.py @@ -5,7 +5,7 @@ import argparse -from objutils.pecoff import PeParser, SymbolAPI +from objutils.pecoff import PeParser def main(argv: list[str] | None = None) -> int: @@ -39,7 +39,7 @@ def main(argv: list[str] | None = None) -> int: syms = pp.symbols # Fallback: if SymbolAPI attr is not present (static type), use direct list - #if not syms and pp.symbols: + # if not syms and pp.symbols: # syms = [type("_S", (), s) for s in pp.symbols] # quick adapter for printing # syms is a list of model.Pe_Symbol; order by value already in fetch() diff --git a/objutils/symbols.py b/objutils/symbols.py index 13b5f0d..5e498bb 100644 --- a/objutils/symbols.py +++ b/objutils/symbols.py @@ -1,442 +1,440 @@ -"""General symbol abstraction that works on top of DWARF, PDB, or whatsoever. - -This module provides a format-neutral type system that can represent type information -from DWARF (ELF debug info), PDB (Windows debug info), or any other debug format. - -The central abstraction for primitive types is :class:`TypeEncoding`, which replaces -the previous format-specific ``encoding: Any`` fields. 
Two helper functions translate -format-specific values to :class:`TypeEncoding`: - -- :func:`type_encoding_from_dwarf_ate` – converts a DWARF ``DW_ATE_*`` integer value -- :func:`type_encoding_from_pdb_bt` – converts a PDB ``BasicType`` (``btXxx``) integer value -""" - -from __future__ import annotations - -import enum -from dataclasses import dataclass, field -from typing import Any, TypeAlias - - -# --------------------------------------------------------------------------- -# Format-neutral type encoding -# --------------------------------------------------------------------------- - - -class TypeKind(enum.Enum): - """Fundamental type category, independent of any debug-format specifics. - - Groups: - Basic: VOID, BOOLEAN, ADDRESS, INTEGER, FLOAT - Float ext.: COMPLEX_FLOAT, IMAGINARY_FLOAT, DECIMAL_FLOAT - Characters: CHAR (repertoire stored in :class:`CharEncoding`) - Scaled: FIXED, PACKED_DECIMAL, NUMERIC_STRING, EDITED - Windows/COM: BCD, BIT, CURRENCY, DATE, VARIANT, HRESULT, BSTR - Fallback: UNKNOWN - """ - - VOID = "void" - BOOLEAN = "boolean" - ADDRESS = "address" - INTEGER = "integer" - FLOAT = "float" - COMPLEX_FLOAT = "complex_float" - IMAGINARY_FLOAT = "imaginary_float" - DECIMAL_FLOAT = "decimal_float" - CHAR = "char" - FIXED = "fixed" - PACKED_DECIMAL = "packed_decimal" - NUMERIC_STRING = "numeric_string" - EDITED = "edited" - BCD = "bcd" - BIT = "bit" - CURRENCY = "currency" - DATE = "date" - VARIANT = "variant" - HRESULT = "hresult" - BSTR = "bstr" - UNKNOWN = "unknown" - - -class Signedness(enum.Enum): - """Sign property of a type. - - Attributes: - SIGNED: Explicitly signed (e.g. ``int``, ``signed char``). - UNSIGNED: Explicitly unsigned (e.g. ``unsigned int``, ``uint8_t``). - NOT_APPLICABLE: Signedness is semantically meaningless for this kind - (float, void, bool, unicode char, address, …). - UNSPECIFIED: Signedness is theoretically applicable but not yet - determined – e.g. plain ``char`` whose signedness is - implementation-defined in C. 
- """ - - SIGNED = "signed" - UNSIGNED = "unsigned" - NOT_APPLICABLE = "n/a" - UNSPECIFIED = "unspecified" - - -class CharEncoding(enum.Enum): - """Character repertoire / encoding, only meaningful when :attr:`TypeKind` is ``CHAR``. - - Attributes: - UNSPECIFIED: Byte-sized ``char`` without a specified encoding (C ``char``). - ASCII: ISO/IEC 646:1991 – Fortran ``ASCII`` kind; DWARF ``DW_ATE_ASCII``. - UCS: ISO/IEC 10646 UCS-4 – Fortran ``ISO_10646`` kind; DWARF ``DW_ATE_UCS``. - UTF: ISO/IEC 10646-1:1993 (general Unicode); DWARF ``DW_ATE_UTF``. - UTF8: UTF-8 – C23/C++20 ``char8_t`` (unsigned); PDB ``btChar8``. - UTF16: UTF-16 – C++11 ``char16_t``; PDB ``btChar16``. - UTF32: UTF-32 – C++11 ``char32_t``; PDB ``btChar32``. - WIDE: Platform-defined wide character ``wchar_t``; PDB ``btWChar``. - """ - - UNSPECIFIED = "unspecified" - ASCII = "ascii" - UCS = "ucs" - UTF = "utf" - UTF8 = "utf8" - UTF16 = "utf16" - UTF32 = "utf32" - WIDE = "wide" - - -@dataclass(frozen=True) -class TypeEncoding: - """Format-neutral encoding descriptor for a primitive type. - - Combines a :class:`TypeKind` with optional :class:`Signedness` and - :class:`CharEncoding` qualifiers. The dataclass is *frozen* so instances - are hashable and can be used as dict or set keys. - - Args: - kind: Fundamental category of the type. - signedness: Sign property (defaults to :attr:`Signedness.NOT_APPLICABLE`). - char_encoding: Character repertoire for ``CHAR`` types - (defaults to :attr:`CharEncoding.UNSPECIFIED`). - - Examples:: - - >>> TypeEncoding(TypeKind.INTEGER, Signedness.SIGNED) - TypeEncoding(kind=INTEGER, signedness=SIGNED, char_encoding=UNSPECIFIED) - >>> TypeEncoding(TypeKind.CHAR, Signedness.NOT_APPLICABLE, CharEncoding.UTF16) - TypeEncoding(kind=CHAR, signedness=NOT_APPLICABLE, char_encoding=UTF16) - - Use :func:`type_encoding_from_dwarf_ate` and :func:`type_encoding_from_pdb_bt` - to construct instances from format-specific values. 
- """ - - kind: TypeKind - signedness: Signedness = Signedness.NOT_APPLICABLE - char_encoding: CharEncoding = CharEncoding.UNSPECIFIED - - # ------------------------------------------------------------------ - # Convenience predicates - # ------------------------------------------------------------------ - - def is_signed(self) -> bool: - """Return ``True`` if the type is explicitly signed.""" - return self.signedness == Signedness.SIGNED - - def is_unsigned(self) -> bool: - """Return ``True`` if the type is explicitly unsigned.""" - return self.signedness == Signedness.UNSIGNED - - def is_integer(self) -> bool: - """Return ``True`` for integer kinds (signed or unsigned).""" - return self.kind == TypeKind.INTEGER - - def is_float(self) -> bool: - """Return ``True`` for any floating-point kind.""" - return self.kind in (TypeKind.FLOAT, TypeKind.COMPLEX_FLOAT, TypeKind.IMAGINARY_FLOAT, TypeKind.DECIMAL_FLOAT) - - def is_char(self) -> bool: - """Return ``True`` for character types.""" - return self.kind == TypeKind.CHAR - - def is_void(self) -> bool: - """Return ``True`` for the void type.""" - return self.kind == TypeKind.VOID - - def is_boolean(self) -> bool: - """Return ``True`` for boolean types.""" - return self.kind == TypeKind.BOOLEAN - - def __str__(self) -> str: - parts: list[str] = [] - if self.signedness not in (Signedness.NOT_APPLICABLE, Signedness.UNSPECIFIED): - parts.append(self.signedness.value) - parts.append(self.kind.value) - if self.char_encoding != CharEncoding.UNSPECIFIED: - parts.append(f"({self.char_encoding.value})") - return " ".join(parts) - - def __repr__(self) -> str: - return ( - f"TypeEncoding(kind={self.kind.name}, signedness={self.signedness.name}, " - f"char_encoding={self.char_encoding.name})" - ) - - -# --------------------------------------------------------------------------- -# Conversion: DWARF DW_ATE_* → TypeEncoding -# --------------------------------------------------------------------------- - -# Raw integer keys are the 
DW_ATE_* values from DWARF4 Table 5.1. -# HP vendor extensions (0x80–0x8B) are not listed here; they map to UNKNOWN. -_DWARF_ATE_MAP: dict[int, TypeEncoding] = { - 0x0: TypeEncoding(TypeKind.VOID), # void (compiler extension, not in spec) - 0x1: TypeEncoding(TypeKind.ADDRESS, Signedness.NOT_APPLICABLE), # DW_ATE_address - 0x2: TypeEncoding(TypeKind.BOOLEAN, Signedness.NOT_APPLICABLE), # DW_ATE_boolean - 0x3: TypeEncoding(TypeKind.COMPLEX_FLOAT, Signedness.NOT_APPLICABLE), # DW_ATE_complex_float - 0x4: TypeEncoding(TypeKind.FLOAT, Signedness.NOT_APPLICABLE), # DW_ATE_float - 0x5: TypeEncoding(TypeKind.INTEGER, Signedness.SIGNED), # DW_ATE_signed - 0x6: TypeEncoding(TypeKind.CHAR, Signedness.SIGNED, CharEncoding.UNSPECIFIED), # DW_ATE_signed_char - 0x7: TypeEncoding(TypeKind.INTEGER, Signedness.UNSIGNED), # DW_ATE_unsigned - 0x8: TypeEncoding(TypeKind.CHAR, Signedness.UNSIGNED, CharEncoding.UNSPECIFIED), # DW_ATE_unsigned_char - 0x9: TypeEncoding(TypeKind.IMAGINARY_FLOAT, Signedness.NOT_APPLICABLE), # DW_ATE_imaginary_float - 0xA: TypeEncoding(TypeKind.PACKED_DECIMAL), # DW_ATE_packed_decimal - 0xB: TypeEncoding(TypeKind.NUMERIC_STRING), # DW_ATE_numeric_string - 0xC: TypeEncoding(TypeKind.EDITED), # DW_ATE_edited - 0xD: TypeEncoding(TypeKind.FIXED, Signedness.SIGNED), # DW_ATE_signed_fixed - 0xE: TypeEncoding(TypeKind.FIXED, Signedness.UNSIGNED), # DW_ATE_unsigned_fixed - 0xF: TypeEncoding(TypeKind.DECIMAL_FLOAT, Signedness.NOT_APPLICABLE), # DW_ATE_decimal_float - 0x10: TypeEncoding(TypeKind.CHAR, Signedness.NOT_APPLICABLE, CharEncoding.UTF), # DW_ATE_UTF (char16_t / char32_t / u8 in C++) - 0x11: TypeEncoding(TypeKind.CHAR, Signedness.NOT_APPLICABLE, CharEncoding.UCS), # DW_ATE_UCS (Fortran ISO_10646) - 0x12: TypeEncoding(TypeKind.CHAR, Signedness.NOT_APPLICABLE, CharEncoding.ASCII), # DW_ATE_ASCII (Fortran ASCII kind) -} - - -def type_encoding_from_dwarf_ate(ate_value: int) -> TypeEncoding: - """Convert a DWARF ``DW_ATE_*`` integer value to a 
:class:`TypeEncoding`. - - Args: - ate_value: Raw ``DW_AT_encoding`` value (e.g. ``BaseTypeEncoding.signed`` = 5). - - Returns: - Matching :class:`TypeEncoding`, or ``TypeEncoding(TypeKind.UNKNOWN)`` for - unrecognised or vendor-extension values. - - Example:: - - >>> from objutils.dwarf.constants import BaseTypeEncoding - >>> type_encoding_from_dwarf_ate(int(BaseTypeEncoding.float)) - TypeEncoding(kind=FLOAT, signedness=NOT_APPLICABLE, char_encoding=UNSPECIFIED) - """ - return _DWARF_ATE_MAP.get(int(ate_value), TypeEncoding(TypeKind.UNKNOWN)) - - -# --------------------------------------------------------------------------- -# Conversion: PDB BasicType → TypeEncoding -# --------------------------------------------------------------------------- - -# Raw integer keys are the btXxx values from Microsoft cvconst.h. -_PDB_BT_MAP: dict[int, TypeEncoding] = { - 0: TypeEncoding(TypeKind.UNKNOWN), # btNoType - 1: TypeEncoding(TypeKind.VOID), # btVoid - 2: TypeEncoding(TypeKind.CHAR, Signedness.UNSPECIFIED, CharEncoding.ASCII), # btChar – plain C char (impl-defined signedness) - 3: TypeEncoding(TypeKind.CHAR, Signedness.NOT_APPLICABLE, CharEncoding.WIDE), # btWChar – wchar_t - 6: TypeEncoding(TypeKind.INTEGER, Signedness.SIGNED), # btInt - 7: TypeEncoding(TypeKind.INTEGER, Signedness.UNSIGNED), # btUInt - 8: TypeEncoding(TypeKind.FLOAT, Signedness.NOT_APPLICABLE), # btFloat - 9: TypeEncoding(TypeKind.BCD), # btBCD - 10: TypeEncoding(TypeKind.BOOLEAN, Signedness.NOT_APPLICABLE), # btBool - 13: TypeEncoding(TypeKind.INTEGER, Signedness.SIGNED), # btLong (size captured in byte_size) - 14: TypeEncoding(TypeKind.INTEGER, Signedness.UNSIGNED), # btULong - 25: TypeEncoding(TypeKind.CURRENCY), # btCurrency - 26: TypeEncoding(TypeKind.DATE), # btDate - 27: TypeEncoding(TypeKind.VARIANT), # btVariant - 28: TypeEncoding(TypeKind.COMPLEX_FLOAT, Signedness.NOT_APPLICABLE), # btComplex - 29: TypeEncoding(TypeKind.BIT), # btBit - 30: TypeEncoding(TypeKind.BSTR), # btBSTR - 31: 
TypeEncoding(TypeKind.HRESULT), # btHresult - 32: TypeEncoding(TypeKind.CHAR, Signedness.NOT_APPLICABLE, CharEncoding.UTF16), # btChar16 – char16_t (C++11) - 33: TypeEncoding(TypeKind.CHAR, Signedness.NOT_APPLICABLE, CharEncoding.UTF32), # btChar32 – char32_t (C++11) - 34: TypeEncoding(TypeKind.CHAR, Signedness.UNSIGNED, CharEncoding.UTF8), # btChar8 – char8_t (C++20, always unsigned) -} - - -def type_encoding_from_pdb_bt(bt_value: int) -> TypeEncoding: - """Convert a PDB ``BasicType`` integer value to a :class:`TypeEncoding`. - - Args: - bt_value: Raw ``BasicType`` value from dbghelp/cvconst.h - (e.g. ``BasicType.btFloat`` = 8). - - Returns: - Matching :class:`TypeEncoding`, or ``TypeEncoding(TypeKind.UNKNOWN)`` for - unrecognised values. - - Example:: - - >>> from objutils.pecoff.pdb import BasicType - >>> type_encoding_from_pdb_bt(int(BasicType.btFloat)) - TypeEncoding(kind=FLOAT, signedness=NOT_APPLICABLE, char_encoding=UNSPECIFIED) - """ - return _PDB_BT_MAP.get(int(bt_value), TypeEncoding(TypeKind.UNKNOWN)) - - -# --------------------------------------------------------------------------- -# Symbol / type dataclasses -# --------------------------------------------------------------------------- - - -@dataclass -class PrimitiveType: - """A primitive / base type. - - Attributes: - name: Type name as it appears in the source (e.g. ``"int``, ``"float"``). - encoding: Format-neutral :class:`TypeEncoding` describing how the value - is encoded and interpreted. - byte_size: Storage size in bytes. 
- """ - - name: str - encoding: TypeEncoding - byte_size: int - - -@dataclass -class ArrayType: - type: TypeInfo - array_spec: list[tuple[int, int]] = field(default_factory=list) - - -@dataclass -class TypeDefinition: - name: str - type: TypeInfo - - -@dataclass -class VolatileType: - type: TypeInfo - - -@dataclass -class ConstantType: - type: TypeInfo - - -@dataclass -class PointerType: - type: TypeInfo - - -@dataclass -class ReferenceType: - type: TypeInfo - - -@dataclass -class Enumerator: - name: str - value: int - - -@dataclass -class EnumerationType: - """An enumeration type. - - Attributes: - name: Enumeration name. - byte_size: Storage size in bytes. - encoding: :class:`TypeEncoding` of the underlying integer type, - or ``None`` when not determinable. - base_type: Resolved underlying type (usually a :class:`PrimitiveType`). - enumerators: List of named enumeration constants. - """ - - name: str - byte_size: int - encoding: TypeEncoding | None - base_type: TypeInfo - enumerators: list[Enumerator] = field(default_factory=list) - - -@dataclass -class UnspecifiedType: - name: str - - -@dataclass -class StructMember: - name: str - type: TypeInfo - offset: int - - -@dataclass -class StructureType: - name: str - byte_size: int - member: list[StructMember] = field(default_factory=list) - - -@dataclass -class ClassMember: - name: str - linkage_name: str - type: TypeInfo - offset: int - accessibility: Any # Accessibility - external: bool - - -@dataclass -class ClassType: - name: str - byte_size: int - member: list[ClassMember] = field(default_factory=list) - - -@dataclass -class UnionType: - name: str - byte_size: int - alternatives: list[StructMember] = field(default_factory=list) - - -@dataclass -class SubroutineType: - name: str - prototyped: int - return_type: TypeInfo - parameters: list[TypeInfo] = field(default_factory=list) - - -@dataclass -class VariableType: - name: str - type: TypeInfo - location: int - size: int - - -@dataclass -class DataType: - name: str 
- value: Any - type: TypeInfo - datakind: Any - - -TypeInfo: TypeAlias = ( - PrimitiveType - | ArrayType - | TypeDefinition - | VolatileType - | ConstantType - | PointerType - | ReferenceType - | EnumerationType - | UnspecifiedType - | StructureType - | ClassType - | UnionType - | SubroutineType -) - -# Backward-compatible alias (legacy typo retained intentionally). -TypeDefiniton = TypeDefinition +"""General symbol abstraction that works on top of DWARF, PDB, or other debug-information formats. + +This module provides a format-neutral type system that can represent type information +from DWARF (ELF debug info), PDB (Windows debug info), or any other debug format. + +The central abstraction for primitive types is :class:`TypeEncoding`, which replaces +the previous format-specific ``encoding: Any`` fields. Two helper functions translate +format-specific values to :class:`TypeEncoding`: + +- :func:`type_encoding_from_dwarf_ate` – converts a DWARF ``DW_ATE_*`` integer value +- :func:`type_encoding_from_pdb_bt` – converts a PDB ``BasicType`` (``btXxx``) integer value +""" + +from __future__ import annotations + +import enum +from dataclasses import dataclass, field +from typing import Any, TypeAlias + +# --------------------------------------------------------------------------- +# Format-neutral type encoding +# --------------------------------------------------------------------------- + + +class TypeKind(enum.Enum): + """Fundamental type category, independent of any debug-format specifics.
+ + Groups: + Basic: VOID, BOOLEAN, ADDRESS, INTEGER, FLOAT + Float ext.: COMPLEX_FLOAT, IMAGINARY_FLOAT, DECIMAL_FLOAT + Characters: CHAR (repertoire stored in :class:`CharEncoding`) + Scaled: FIXED, PACKED_DECIMAL, NUMERIC_STRING, EDITED + Windows/COM: BCD, BIT, CURRENCY, DATE, VARIANT, HRESULT, BSTR + Fallback: UNKNOWN + """ + + VOID = "void" + BOOLEAN = "boolean" + ADDRESS = "address" + INTEGER = "integer" + FLOAT = "float" + COMPLEX_FLOAT = "complex_float" + IMAGINARY_FLOAT = "imaginary_float" + DECIMAL_FLOAT = "decimal_float" + CHAR = "char" + FIXED = "fixed" + PACKED_DECIMAL = "packed_decimal" + NUMERIC_STRING = "numeric_string" + EDITED = "edited" + BCD = "bcd" + BIT = "bit" + CURRENCY = "currency" + DATE = "date" + VARIANT = "variant" + HRESULT = "hresult" + BSTR = "bstr" + UNKNOWN = "unknown" + + +class Signedness(enum.Enum): + """Sign property of a type. + + Attributes: + SIGNED: Explicitly signed (e.g. ``int``, ``signed char``). + UNSIGNED: Explicitly unsigned (e.g. ``unsigned int``, ``uint8_t``). + NOT_APPLICABLE: Signedness is semantically meaningless for this kind + (float, void, bool, unicode char, address, …). + UNSPECIFIED: Signedness is theoretically applicable but not yet + determined – e.g. plain ``char`` whose signedness is + implementation-defined in C. + """ + + SIGNED = "signed" + UNSIGNED = "unsigned" + NOT_APPLICABLE = "n/a" + UNSPECIFIED = "unspecified" + + +class CharEncoding(enum.Enum): + """Character repertoire / encoding, only meaningful when :attr:`TypeKind` is ``CHAR``. + + Attributes: + UNSPECIFIED: Byte-sized ``char`` without a specified encoding (C ``char``). + ASCII: ISO/IEC 646:1991 – Fortran ``ASCII`` kind; DWARF ``DW_ATE_ASCII``. + UCS: ISO/IEC 10646 UCS-4 – Fortran ``ISO_10646`` kind; DWARF ``DW_ATE_UCS``. + UTF: ISO/IEC 10646-1:1993 (general Unicode); DWARF ``DW_ATE_UTF``. + UTF8: UTF-8 – C23/C++20 ``char8_t`` (unsigned); PDB ``btChar8``. + UTF16: UTF-16 – C++11 ``char16_t``; PDB ``btChar16``. 
+ UTF32: UTF-32 – C++11 ``char32_t``; PDB ``btChar32``. + WIDE: Platform-defined wide character ``wchar_t``; PDB ``btWChar``. + """ + + UNSPECIFIED = "unspecified" + ASCII = "ascii" + UCS = "ucs" + UTF = "utf" + UTF8 = "utf8" + UTF16 = "utf16" + UTF32 = "utf32" + WIDE = "wide" + + +@dataclass(frozen=True) +class TypeEncoding: + """Format-neutral encoding descriptor for a primitive type. + + Combines a :class:`TypeKind` with optional :class:`Signedness` and + :class:`CharEncoding` qualifiers. The dataclass is *frozen* so instances + are hashable and can be used as dict or set keys. + + Args: + kind: Fundamental category of the type. + signedness: Sign property (defaults to :attr:`Signedness.NOT_APPLICABLE`). + char_encoding: Character repertoire for ``CHAR`` types + (defaults to :attr:`CharEncoding.UNSPECIFIED`). + + Examples:: + + >>> TypeEncoding(TypeKind.INTEGER, Signedness.SIGNED) + TypeEncoding(kind=INTEGER, signedness=SIGNED, char_encoding=UNSPECIFIED) + >>> TypeEncoding(TypeKind.CHAR, Signedness.NOT_APPLICABLE, CharEncoding.UTF16) + TypeEncoding(kind=CHAR, signedness=NOT_APPLICABLE, char_encoding=UTF16) + + Use :func:`type_encoding_from_dwarf_ate` and :func:`type_encoding_from_pdb_bt` + to construct instances from format-specific values. 
+ """ + + kind: TypeKind + signedness: Signedness = Signedness.NOT_APPLICABLE + char_encoding: CharEncoding = CharEncoding.UNSPECIFIED + + # ------------------------------------------------------------------ + # Convenience predicates + # ------------------------------------------------------------------ + + def is_signed(self) -> bool: + """Return ``True`` if the type is explicitly signed.""" + return self.signedness == Signedness.SIGNED + + def is_unsigned(self) -> bool: + """Return ``True`` if the type is explicitly unsigned.""" + return self.signedness == Signedness.UNSIGNED + + def is_integer(self) -> bool: + """Return ``True`` for integer kinds (signed or unsigned).""" + return self.kind == TypeKind.INTEGER + + def is_float(self) -> bool: + """Return ``True`` for any floating-point kind.""" + return self.kind in (TypeKind.FLOAT, TypeKind.COMPLEX_FLOAT, TypeKind.IMAGINARY_FLOAT, TypeKind.DECIMAL_FLOAT) + + def is_char(self) -> bool: + """Return ``True`` for character types.""" + return self.kind == TypeKind.CHAR + + def is_void(self) -> bool: + """Return ``True`` for the void type.""" + return self.kind == TypeKind.VOID + + def is_boolean(self) -> bool: + """Return ``True`` for boolean types.""" + return self.kind == TypeKind.BOOLEAN + + def __str__(self) -> str: + parts: list[str] = [] + if self.signedness not in (Signedness.NOT_APPLICABLE, Signedness.UNSPECIFIED): + parts.append(self.signedness.value) + parts.append(self.kind.value) + if self.char_encoding != CharEncoding.UNSPECIFIED: + parts.append(f"({self.char_encoding.value})") + return " ".join(parts) + + def __repr__(self) -> str: + return ( + f"TypeEncoding(kind={self.kind.name}, signedness={self.signedness.name}, " f"char_encoding={self.char_encoding.name})" + ) + + +# --------------------------------------------------------------------------- +# Conversion: DWARF DW_ATE_* → TypeEncoding +# --------------------------------------------------------------------------- + +# Raw integer keys are the 
DW_ATE_* values from DWARF4 Table 5.1. +# HP vendor extensions (0x80–0x8B) are not listed here; they map to UNKNOWN. +_DWARF_ATE_MAP: dict[int, TypeEncoding] = { + 0x0: TypeEncoding(TypeKind.VOID), # void (compiler extension, not in spec) + 0x1: TypeEncoding(TypeKind.ADDRESS, Signedness.NOT_APPLICABLE), # DW_ATE_address + 0x2: TypeEncoding(TypeKind.BOOLEAN, Signedness.NOT_APPLICABLE), # DW_ATE_boolean + 0x3: TypeEncoding(TypeKind.COMPLEX_FLOAT, Signedness.NOT_APPLICABLE), # DW_ATE_complex_float + 0x4: TypeEncoding(TypeKind.FLOAT, Signedness.NOT_APPLICABLE), # DW_ATE_float + 0x5: TypeEncoding(TypeKind.INTEGER, Signedness.SIGNED), # DW_ATE_signed + 0x6: TypeEncoding(TypeKind.CHAR, Signedness.SIGNED, CharEncoding.UNSPECIFIED), # DW_ATE_signed_char + 0x7: TypeEncoding(TypeKind.INTEGER, Signedness.UNSIGNED), # DW_ATE_unsigned + 0x8: TypeEncoding(TypeKind.CHAR, Signedness.UNSIGNED, CharEncoding.UNSPECIFIED), # DW_ATE_unsigned_char + 0x9: TypeEncoding(TypeKind.IMAGINARY_FLOAT, Signedness.NOT_APPLICABLE), # DW_ATE_imaginary_float + 0xA: TypeEncoding(TypeKind.PACKED_DECIMAL), # DW_ATE_packed_decimal + 0xB: TypeEncoding(TypeKind.NUMERIC_STRING), # DW_ATE_numeric_string + 0xC: TypeEncoding(TypeKind.EDITED), # DW_ATE_edited + 0xD: TypeEncoding(TypeKind.FIXED, Signedness.SIGNED), # DW_ATE_signed_fixed + 0xE: TypeEncoding(TypeKind.FIXED, Signedness.UNSIGNED), # DW_ATE_unsigned_fixed + 0xF: TypeEncoding(TypeKind.DECIMAL_FLOAT, Signedness.NOT_APPLICABLE), # DW_ATE_decimal_float + 0x10: TypeEncoding(TypeKind.CHAR, Signedness.NOT_APPLICABLE, CharEncoding.UTF), # DW_ATE_UTF (char16_t / char32_t / u8 in C++) + 0x11: TypeEncoding(TypeKind.CHAR, Signedness.NOT_APPLICABLE, CharEncoding.UCS), # DW_ATE_UCS (Fortran ISO_10646) + 0x12: TypeEncoding(TypeKind.CHAR, Signedness.NOT_APPLICABLE, CharEncoding.ASCII), # DW_ATE_ASCII (Fortran ASCII kind) +} + + +def type_encoding_from_dwarf_ate(ate_value: int) -> TypeEncoding: + """Convert a DWARF ``DW_ATE_*`` integer value to a 
:class:`TypeEncoding`. + + Args: + ate_value: Raw ``DW_AT_encoding`` value (e.g. ``BaseTypeEncoding.signed`` = 5). + + Returns: + Matching :class:`TypeEncoding`, or ``TypeEncoding(TypeKind.UNKNOWN)`` for + unrecognised or vendor-extension values. + + Example:: + + >>> from objutils.dwarf.constants import BaseTypeEncoding + >>> type_encoding_from_dwarf_ate(int(BaseTypeEncoding.float)) + TypeEncoding(kind=FLOAT, signedness=NOT_APPLICABLE, char_encoding=UNSPECIFIED) + """ + return _DWARF_ATE_MAP.get(int(ate_value), TypeEncoding(TypeKind.UNKNOWN)) + + +# --------------------------------------------------------------------------- +# Conversion: PDB BasicType → TypeEncoding +# --------------------------------------------------------------------------- + +# Raw integer keys are the btXxx values from Microsoft cvconst.h. +_PDB_BT_MAP: dict[int, TypeEncoding] = { + 0: TypeEncoding(TypeKind.UNKNOWN), # btNoType + 1: TypeEncoding(TypeKind.VOID), # btVoid + 2: TypeEncoding(TypeKind.CHAR, Signedness.UNSPECIFIED, CharEncoding.ASCII), # btChar – plain C char (impl-defined signedness) + 3: TypeEncoding(TypeKind.CHAR, Signedness.NOT_APPLICABLE, CharEncoding.WIDE), # btWChar – wchar_t + 6: TypeEncoding(TypeKind.INTEGER, Signedness.SIGNED), # btInt + 7: TypeEncoding(TypeKind.INTEGER, Signedness.UNSIGNED), # btUInt + 8: TypeEncoding(TypeKind.FLOAT, Signedness.NOT_APPLICABLE), # btFloat + 9: TypeEncoding(TypeKind.BCD), # btBCD + 10: TypeEncoding(TypeKind.BOOLEAN, Signedness.NOT_APPLICABLE), # btBool + 13: TypeEncoding(TypeKind.INTEGER, Signedness.SIGNED), # btLong (size captured in byte_size) + 14: TypeEncoding(TypeKind.INTEGER, Signedness.UNSIGNED), # btULong + 25: TypeEncoding(TypeKind.CURRENCY), # btCurrency + 26: TypeEncoding(TypeKind.DATE), # btDate + 27: TypeEncoding(TypeKind.VARIANT), # btVariant + 28: TypeEncoding(TypeKind.COMPLEX_FLOAT, Signedness.NOT_APPLICABLE), # btComplex + 29: TypeEncoding(TypeKind.BIT), # btBit + 30: TypeEncoding(TypeKind.BSTR), # btBSTR + 31: 
TypeEncoding(TypeKind.HRESULT), # btHresult + 32: TypeEncoding(TypeKind.CHAR, Signedness.NOT_APPLICABLE, CharEncoding.UTF16), # btChar16 – char16_t (C++11) + 33: TypeEncoding(TypeKind.CHAR, Signedness.NOT_APPLICABLE, CharEncoding.UTF32), # btChar32 – char32_t (C++11) + 34: TypeEncoding(TypeKind.CHAR, Signedness.UNSIGNED, CharEncoding.UTF8), # btChar8 – char8_t (C++20, always unsigned) +} + + +def type_encoding_from_pdb_bt(bt_value: int) -> TypeEncoding: + """Convert a PDB ``BasicType`` integer value to a :class:`TypeEncoding`. + + Args: + bt_value: Raw ``BasicType`` value from dbghelp/cvconst.h + (e.g. ``BasicType.btFloat`` = 8). + + Returns: + Matching :class:`TypeEncoding`, or ``TypeEncoding(TypeKind.UNKNOWN)`` for + unrecognised values. + + Example:: + + >>> from objutils.pecoff.pdb import BasicType + >>> type_encoding_from_pdb_bt(int(BasicType.btFloat)) + TypeEncoding(kind=FLOAT, signedness=NOT_APPLICABLE, char_encoding=UNSPECIFIED) + """ + return _PDB_BT_MAP.get(int(bt_value), TypeEncoding(TypeKind.UNKNOWN)) + + +# --------------------------------------------------------------------------- +# Symbol / type dataclasses +# --------------------------------------------------------------------------- + + +@dataclass +class PrimitiveType: + """A primitive / base type. + + Attributes: + name: Type name as it appears in the source (e.g. ``"int"``, ``"float"``). + encoding: Format-neutral :class:`TypeEncoding` describing how the value + is encoded and interpreted. + byte_size: Storage size in bytes.
+ """ + + name: str + encoding: TypeEncoding + byte_size: int + + +@dataclass +class ArrayType: + type: TypeInfo + array_spec: list[tuple[int, int]] = field(default_factory=list) + + +@dataclass +class TypeDefinition: + name: str + type: TypeInfo + + +@dataclass +class VolatileType: + type: TypeInfo + + +@dataclass +class ConstantType: + type: TypeInfo + + +@dataclass +class PointerType: + type: TypeInfo + + +@dataclass +class ReferenceType: + type: TypeInfo + + +@dataclass +class Enumerator: + name: str + value: int + + +@dataclass +class EnumerationType: + """An enumeration type. + + Attributes: + name: Enumeration name. + byte_size: Storage size in bytes. + encoding: :class:`TypeEncoding` of the underlying integer type, + or ``None`` when not determinable. + base_type: Resolved underlying type (usually a :class:`PrimitiveType`). + enumerators: List of named enumeration constants. + """ + + name: str + byte_size: int + encoding: TypeEncoding | None + base_type: TypeInfo + enumerators: list[Enumerator] = field(default_factory=list) + + +@dataclass +class UnspecifiedType: + name: str + + +@dataclass +class StructMember: + name: str + type: TypeInfo + offset: int + + +@dataclass +class StructureType: + name: str + byte_size: int + member: list[StructMember] = field(default_factory=list) + + +@dataclass +class ClassMember: + name: str + linkage_name: str + type: TypeInfo + offset: int + accessibility: Any # Accessibility + external: bool + + +@dataclass +class ClassType: + name: str + byte_size: int + member: list[ClassMember] = field(default_factory=list) + + +@dataclass +class UnionType: + name: str + byte_size: int + alternatives: list[StructMember] = field(default_factory=list) + + +@dataclass +class SubroutineType: + name: str + prototyped: int + return_type: TypeInfo + parameters: list[TypeInfo] = field(default_factory=list) + + +@dataclass +class VariableType: + name: str + type: TypeInfo + location: int + size: int + + +@dataclass +class DataType: + name: str 
+ value: Any + type: TypeInfo + datakind: Any + + +TypeInfo: TypeAlias = ( + PrimitiveType + | ArrayType + | TypeDefinition + | VolatileType + | ConstantType + | PointerType + | ReferenceType + | EnumerationType + | UnspecifiedType + | StructureType + | ClassType + | UnionType + | SubroutineType +) + +# Backward-compatible alias (legacy typo retained intentionally). +TypeDefiniton = TypeDefinition