From d989510a05ef3026b21730bac92c81a80e8963fc Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
<66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 25 May 2026 21:14:28 +0000
Subject: [PATCH 1/2] [pre-commit.ci] pre-commit autoupdate
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
updates:
- [github.com/psf/black-pre-commit-mirror: 26.5.0 → 26.5.1](https://github.com/psf/black-pre-commit-mirror/compare/26.5.0...26.5.1)
- [github.com/astral-sh/ruff-pre-commit: v0.15.13 → v0.15.14](https://github.com/astral-sh/ruff-pre-commit/compare/v0.15.13...v0.15.14)
---
.pre-commit-config.yaml | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 12a3299..aea4434 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -35,14 +35,14 @@ repos:
- id: trailing-whitespace
- repo: https://github.com/psf/black-pre-commit-mirror
- rev: 26.5.0
+ rev: 26.5.1
hooks:
- id: black
language_version: python3.13
args: ["--line-length=132"]
- repo: https://github.com/astral-sh/ruff-pre-commit
- rev: v0.15.13
+ rev: v0.15.14
hooks:
- id: ruff
args: ["--fix", "--exit-non-zero-on-fix"]
From e3f161242da46432193c28bcf8cef056242aa510 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
<66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 25 May 2026 21:14:43 +0000
Subject: [PATCH 2/2] [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---
.idea/objutils@1.iml | 2 +-
objutils/elf/defs.py | 74 +-
objutils/pecoff/pdb/__init__.py | 2897 +++++++++++++++---------------
objutils/scripts/oj_coff_syms.py | 4 +-
objutils/symbols.py | 882 +++++----
5 files changed, 1931 insertions(+), 1928 deletions(-)
diff --git a/.idea/objutils@1.iml b/.idea/objutils@1.iml
index 8e33d7d..6276174 100644
--- a/.idea/objutils@1.iml
+++ b/.idea/objutils@1.iml
@@ -8,4 +8,4 @@
-
\ No newline at end of file
+
diff --git a/objutils/elf/defs.py b/objutils/elf/defs.py
index 620e7d0..7062a74 100644
--- a/objutils/elf/defs.py
+++ b/objutils/elf/defs.py
@@ -305,8 +305,8 @@ class ELFMachineType(enum.IntEnum):
EM_ETPU = 178 # Freescale Extended Time Processing Unit.
EM_SLE9X = 179 # Infineon Technologies SLE9X core.
EM_L1OM = 180 # Intel L1OM.
- EM_K10M = 181 # Intel K10M
- EM_AARCH64 = 183 # ARM AArch64
+ EM_K10M = 181 # Intel K10M
+ EM_AARCH64 = 183 # ARM AArch64
EM_AVR32 = 185 # Atmel Corporation 32-bit microprocessor family.
EM_STM8 = 186 # STMicroeletronics STM8 8-bit microcontroller.
EM_TILE64 = 187 # Tilera TILE64 multicore architecture family.
@@ -314,40 +314,40 @@ class ELFMachineType(enum.IntEnum):
EM_MICROBLAZE = 189 # Xilinx MicroBlaze 32-bit RISC soft processor core.
EM_CUDA = 190 # NVIDIA CUDA architecture.
EM_TILEGX = 191 # Tilera TILE-Gx
- EM_CLOUDSHIELD = 192, # CloudShield architecture family
- EM_COREA_1ST = 193, # KIPO-KAIST Core-A 1st generation processor family
- EM_COREA_2ND = 194, # KIPO-KAIST Core-A 2nd generation processor family
- EM_ARC_COMPACT2 = 195, # Synopsys ARCompact V2
- EM_OPEN8 = 196, # Open8 8-bit RISC soft processor core
- EM_RL78 = 197, # Renesas RL78 family
- EM_VIDEOCORE5 = 198, # Broadcom VideoCore V processor
- EM_78KOR = 199, # Renesas 78KOR family
- EM_56800EX = 200, # Freescale 56800EX Digital Signal Controller (DSC)
- EM_BA1 = 201, # Beyond BA1 CPU architecture
- EM_BA2 = 202, # Beyond BA2 CPU architecture
- EM_XCORE = 203, # XMOS xCORE processor family
- EM_MCHP_PIC = 204, # Microchip 8-bit PIC(r) family
- EM_INTEL205 = 205, # Reserved by Intel
- EM_INTEL206 = 206, # Reserved by Intel
- EM_INTEL207 = 207, # Reserved by Intel
- EM_INTEL208 = 208, # Reserved by Intel
- EM_INTEL209 = 209, # Reserved by Intel
- EM_KM32 = 210, # KM211 KM32 32-bit processor
- EM_KMX32 = 211, # KM211 KMX32 32-bit processor
- EM_KMX16 = 212, # KM211 KMX16 16-bit processor
- EM_KMX8 = 213, # KM211 KMX8 8-bit processor
- EM_KVARC = 214, # KM211 KVARC processor
- EM_CDP = 215, # Paneve CDP architecture family
- EM_COGE = 216, # Cognitive Smart Memory Processor
- EM_COOL = 217, # iCelero CoolEngine
- EM_NORC = 218, # Nanoradio Optimized RISC
- EM_CSR_KALIMBA = 219, # CSR Kalimba architecture family
- EM_AMDGPU = 224, # AMD GPU architecture
- EM_RISCV = 243, # RISC-V
- EM_LANAI = 244, # Lanai 32-bit processor
- EM_BPF = 247, # Linux kernel bpf virtual machine
- EM_VE = 251, # NEC SX-Aurora VE
- EM_CSKY = 252, # C-SKY 32-bit processor
+ EM_CLOUDSHIELD = (192,) # CloudShield architecture family
+ EM_COREA_1ST = (193,) # KIPO-KAIST Core-A 1st generation processor family
+ EM_COREA_2ND = (194,) # KIPO-KAIST Core-A 2nd generation processor family
+ EM_ARC_COMPACT2 = (195,) # Synopsys ARCompact V2
+ EM_OPEN8 = (196,) # Open8 8-bit RISC soft processor core
+ EM_RL78 = (197,) # Renesas RL78 family
+ EM_VIDEOCORE5 = (198,) # Broadcom VideoCore V processor
+ EM_78KOR = (199,) # Renesas 78KOR family
+ EM_56800EX = (200,) # Freescale 56800EX Digital Signal Controller (DSC)
+ EM_BA1 = (201,) # Beyond BA1 CPU architecture
+ EM_BA2 = (202,) # Beyond BA2 CPU architecture
+ EM_XCORE = (203,) # XMOS xCORE processor family
+ EM_MCHP_PIC = (204,) # Microchip 8-bit PIC(r) family
+ EM_INTEL205 = (205,) # Reserved by Intel
+ EM_INTEL206 = (206,) # Reserved by Intel
+ EM_INTEL207 = (207,) # Reserved by Intel
+ EM_INTEL208 = (208,) # Reserved by Intel
+ EM_INTEL209 = (209,) # Reserved by Intel
+ EM_KM32 = (210,) # KM211 KM32 32-bit processor
+ EM_KMX32 = (211,) # KM211 KMX32 32-bit processor
+ EM_KMX16 = (212,) # KM211 KMX16 16-bit processor
+ EM_KMX8 = (213,) # KM211 KMX8 8-bit processor
+ EM_KVARC = (214,) # KM211 KVARC processor
+ EM_CDP = (215,) # Paneve CDP architecture family
+ EM_COGE = (216,) # Cognitive Smart Memory Processor
+ EM_COOL = (217,) # iCelero CoolEngine
+ EM_NORC = (218,) # Nanoradio Optimized RISC
+ EM_CSR_KALIMBA = (219,) # CSR Kalimba architecture family
+ EM_AMDGPU = (224,) # AMD GPU architecture
+ EM_RISCV = (243,) # RISC-V
+ EM_LANAI = (244,) # Lanai 32-bit processor
+ EM_BPF = (247,) # Linux kernel bpf virtual machine
+ EM_VE = (251,) # NEC SX-Aurora VE
+ EM_CSKY = (252,) # C-SKY 32-bit processor
EM_AVR_OLD = 0x1057 # AVR magic number. Written in the absense of an ABI.
EM_MSP430_OLD = 0x1059 # MSP430 magic number. Written in the absense of everything.
EM_MT = 0x2530 # Morpho MT. Written in the absense of an ABI.
@@ -954,7 +954,7 @@ class ELFAbiType(enum.IntEnum):
# ELFOSABI_FIRST_ARCH = 64, // First architecture-specific OS ABI
# ELFOSABI_AMDGPU_HSA = 64, // AMD HSA runtime
ELFOSABI_C6000_LINUX = 65 # Linux TMS320C6000
- ELFOSABI_AMDGPU_MESA3D = 66 # AMD GCN GPUs (GFX6+) for MESA runtime
+ ELFOSABI_AMDGPU_MESA3D = 66 # AMD GCN GPUs (GFX6+) for MESA runtime
ELFOSABI_ARM = 97 # ARM
ELFOSABI_STANDALONE = 255 # Standalone (embedded) application
diff --git a/objutils/pecoff/pdb/__init__.py b/objutils/pecoff/pdb/__init__.py
index f52d1e2..13e93c3 100644
--- a/objutils/pecoff/pdb/__init__.py
+++ b/objutils/pecoff/pdb/__init__.py
@@ -1,1446 +1,1451 @@
-#!/usr/bin/env python
-
-"""PDB debug symbol integration for PE/COFF files (Windows only).
-
-This module provides access to Microsoft Program Database (PDB) debug information
-using the Windows dbghelp.dll API. It enables comprehensive symbol lookup beyond
-the typically stripped COFF symbol table in release binaries.
-
-**Platform Support**: Windows only (requires dbghelp.dll)
-
-Overview:
- PDB files contain rich debug information:
-
- - **Symbols**: Function names, variables, constants
- - **Types**: Structures, unions, enums, typedefs
- - **Source Info**: File names, line numbers
- - **Call Frames**: Stack unwinding data
-
- ```
- PE File + PDB:
- ┌──────────────┐ ┌──────────────┐
- │ app.exe │────>│ app.pdb │
- │ │ │ │
- │ Code │ │ - Symbols │
- │ Data │ │ - Types │
- │ (stripped) │ │ - Lines │
- └──────────────┘ └──────────────┘
- ```
-
-Architecture:
- **Windows dbghelp.dll**:
- - Microsoft's debug helper library
- - Symbol server support
- - Handles PDB loading and parsing
- - Provides symbol enumeration API
-
- **Symbol Enumeration**:
- 1. Initialize dbghelp session (SymInitialize)
- 2. Load PE module (SymLoadModuleExW)
- 3. Set symbol search paths
- 4. Enumerate symbols (SymEnumSymbolsA with callback)
- 5. Extract type information (optional)
- 6. Cleanup (SymCleanup)
-
- **Type Information Extraction**:
- - Uses dbghelp type info API (SymGetTypeInfo)
- - Recursively resolves pointers, arrays, structs
- - Extracts sizes, offsets, field names
-
-Usage Examples:
- **Basic Symbol Extraction**:
- ```python
- from objutils.pecoff.pdb import pdb_symbols_for_pe
-
- # Load symbols from PDB
- symbols = pdb_symbols_for_pe("app.exe")
-
- for sym in symbols:
- print(f"{sym['name']:40s} @ {sym['address']:#010x}")
- ```
-
- **With Symbol Search Path**:
- ```python
- # Search multiple directories for PDB
- symbols = pdb_symbols_for_pe(
- "app.exe",
- symbol_path="C:\\Symbols;SRV*C:\\SymCache*https://msdl.microsoft.com/download/symbols"
- )
- ```
-
- **Advanced Session Management**:
- ```python
- from objutils.pecoff.pdb import PdbSession
-
- with PdbSession("app.exe", symbol_path=[".", "C:\\Symbols"]) as pdb:
- # Enumerate all symbols
- for sym in pdb.enum_symbols():
- if sym.is_function():
- print(f"Function: {sym.name} @ {sym.Address:#x}")
-
- # Get module info
- info = pdb.get_module_info()
- print(f"Module base: {info.base_of_dll:#x}")
- ```
-
- **Type Information Extraction**:
- ```python
- from objutils.pecoff.pdb import CTypeInfoDump
-
- # Extract C type definitions
- type_dumper = CTypeInfoDump(pdb_session.handle, base_address)
- type_info = type_dumper.get_type_from_type_index(type_idx)
- print(f"Type: {type_info['type_name']}, Size: {type_info['size']}")
- ```
-
-Key Components:
- **Enums**:
- - **SymTagEnum**: Symbol tag types (function, data, UDT, etc.)
- - **BasicType**: Primitive types (int, float, void, etc.)
- - **SymFlag**: Symbol flags (export, local, function, etc.)
- - **IMAGEHLP_SYMBOL_TYPE_INFO**: Type info query constants
-
- **Data Classes**:
- - **ModuleInfo**: Module metadata (base address, size, entry point)
- - **SYMBOL_INFO**: Symbol information structure (ctypes)
- - **MODULEINFO**: Windows API module info structure
-
- **Core Classes**:
- - **CTypeInfoDump**: Type information extraction and resolution
- - **PdbSession**: Manages dbghelp.dll lifetime and operations
-
-dbghelp.dll API:
- The module wraps these key dbghelp functions:
-
- - **SymInitialize**: Initialize symbol handler
- - **SymCleanup**: Cleanup symbol handler
- - **SymLoadModuleExW**: Load module for symbol resolution
- - **SymEnumSymbolsA**: Enumerate symbols with callback
- - **SymGetTypeInfo**: Query type information
- - **SymSetSearchPath/SymGetSearchPath**: Symbol path management
-
-Symbol Search Paths:
- dbghelp supports flexible symbol search:
-
- - **Local paths**: "C:\\Symbols;D:\\Debug"
- - **Symbol servers**: "SRV*C:\\Cache*https://msdl.microsoft.com/download/symbols"
- - **Combined**: "C:\\Local;SRV*C:\\Cache*https://server"
-
- The `_NT_SYMBOL_PATH` environment variable is respected.
-
-Limitations:
- - **Windows only**: Requires dbghelp.dll (unavailable on Linux/Mac)
- - **PDB required**: Release binaries typically lack embedded COFF symbols
- - **Architecture match**: PDB must match PE architecture (x86/x64)
- - **Version match**: PDB should match PE build (GUID/age check)
- - **Type info**: Complex recursive structures may have limitations
-
-Error Handling:
- On non-Windows platforms, dbghelp/kernel32/psapi are set to None:
-
- ```python
- from objutils.pecoff.pdb import _WINDOWS
-
- if not _WINDOWS:
- print("PDB support unavailable (not Windows)")
- ```
-
- Import errors are caught and gracefully handled in __init__.py.
-
-See Also:
- - objutils.pecoff: Main PE parser that uses this module
- - objutils.pecoff.defs: PE/COFF constants
- - objutils.elf.model: Similar ORM pattern for ELF
- - Microsoft dbghelp.dll documentation
- - PDB format specification
-
-Example Integration:
- ```python
- from objutils.pecoff import PeParser
-
- # PeParser automatically attempts PDB loading
- pe = PeParser("kernel32.dll", pdb_path=["C:\\Symbols"])
-
- # Symbols now include PDB data if found
- for sym in pe.symbols:
- print(f"{sym['name']}: {sym['value']:#x}")
- ```
-"""
-
-import ctypes
-import enum
-from copy import copy
-from ctypes import wintypes
-from dataclasses import dataclass
-from enum import IntEnum
-from functools import lru_cache
-from typing import Any, Optional
-
-# DLLs
-try:
- dbghelp = ctypes.WinDLL("dbghelp") # type: ignore[attr-defined]
- kernel32 = ctypes.WinDLL("kernel32") # type: ignore[attr-defined]
- psapi = ctypes.WinDLL("psapi")
- _WINDOWS = True
-except OSError: # pragma: no cover - non-Windows environment
- dbghelp = None # type: ignore[assignment]
- kernel32 = None # type: ignore[assignment]
- psapi = None
- _WINDOWS = False
-
-from objutils import symbols
-
-
-@dataclass
-class ModuleInfo:
- """Module metadata extracted from Windows process.
-
- Attributes:
- base_of_dll: Base address where module is loaded in memory
- size_of_image: Size of module in memory (bytes)
- entry_point: Address of module entry point (or None)
-
- Example:
- ```python
- info = pdb_session.get_module_info()
- print(f"Module: {info.base_of_dll:#x} - {info.base_of_dll + info.size_of_image:#x}")
- ```
- """
-
- base_of_dll: int
- size_of_image: int
- entry_point: Optional[int]
-
-
-# Types
-HANDLE = wintypes.HANDLE
-HLOCAL = wintypes.HANDLE
-DWORD = wintypes.DWORD
-ULONG = wintypes.ULONG
-ULONG64 = ctypes.c_ulonglong
-BOOL = wintypes.BOOL
-LPVOID = wintypes.LPVOID
-LPCWSTR = wintypes.LPCWSTR
-LPCSTR = wintypes.LPCSTR
-
-
-class VARTYPE(IntEnum):
- """OLE Automation variant type discriminator (vt field of VARIANT)."""
-
- VT_EMPTY = 0
- VT_NULL = 1
- VT_I2 = 2
- VT_I4 = 3
- VT_R4 = 4
- VT_R8 = 5
- VT_BSTR = 8
- VT_BOOL = 11
- VT_I1 = 16
- VT_UI1 = 17
- VT_UI2 = 18
- VT_UI4 = 19
- VT_I8 = 20
- VT_UI8 = 21
- VT_INT = 22
- VT_UINT = 23
-
-
-class _VARIANT_VALUE(ctypes.Union):
- """Inner value union of a COM VARIANT (covers numeric and pointer cases)."""
-
- _fields_ = [
- ("llVal", ctypes.c_longlong),
- ("lVal", ctypes.c_long),
- ("bVal", ctypes.c_ubyte),
- ("iVal", ctypes.c_short),
- ("fltVal", ctypes.c_float),
- ("dblVal", ctypes.c_double),
- ("boolVal", ctypes.c_short),
- ("scode", ctypes.c_long),
- ("cVal", ctypes.c_int8),
- ("uiVal", ctypes.c_ushort),
- ("ulVal", ctypes.c_ulong),
- ("ullVal", ctypes.c_ulonglong),
- ("intVal", ctypes.c_int),
- ("uintVal", ctypes.c_uint),
- ("byref", ctypes.c_void_p),
- ]
-
-
-class VARIANT(ctypes.Structure):
- """Minimal ctypes representation of the OLE Automation VARIANT structure.
-
- The full COM VARIANT is a discriminated union keyed on the ``vt`` field.
- Only the scalar numeric types that can appear as PDB constant values are
- covered here; pointer/array/record sub-types are not needed.
-
- Total size is 16 bytes (matching the Windows ABI definition).
- """
-
- _fields_ = [
- ("vt", ctypes.c_ushort),
- ("wReserved1", ctypes.c_ushort),
- ("wReserved2", ctypes.c_ushort),
- ("wReserved3", ctypes.c_ushort),
- ("_value", _VARIANT_VALUE),
- ]
-
-
-def _variant_to_python(variant):
- """Convert a VARIANT value to an appropriate Python primitive.
-
- Only the numeric VARTYPE values that are relevant for PDB constant symbols
- are handled. Unknown or unsupported types return ``None``.
- """
- try:
- kind = VARTYPE(variant.vt)
- except ValueError:
- return None
- v = variant._value
- _map = {
- VARTYPE.VT_I1: lambda: int(v.cVal),
- VARTYPE.VT_I2: lambda: int(v.iVal),
- VARTYPE.VT_I4: lambda: int(v.lVal),
- VARTYPE.VT_I8: lambda: int(v.llVal),
- VARTYPE.VT_UI1: lambda: int(v.bVal),
- VARTYPE.VT_UI2: lambda: int(v.uiVal),
- VARTYPE.VT_UI4: lambda: int(v.ulVal),
- VARTYPE.VT_UI8: lambda: int(v.ullVal),
- VARTYPE.VT_INT: lambda: int(v.intVal),
- VARTYPE.VT_UINT: lambda: int(v.uintVal),
- VARTYPE.VT_R4: lambda: float(v.fltVal),
- VARTYPE.VT_R8: lambda: float(v.dblVal),
- VARTYPE.VT_BOOL: lambda: bool(v.boolVal),
- }
- fn = _map.get(kind)
- return fn() if fn is not None else None
-
-
-class TI_FINDCHILDREN_PARAMS(ctypes.Structure):
- _fields_ = [
- ("Count", ULONG),
- ("Start", ULONG),
- ("ChildId", ULONG * 1),
- ]
-
-
-# SYMBOL_INFO struct (ANSI)
-MAX_SYM_NAME = 2000
-
-
-class SYMBOL_INFO(ctypes.Structure):
- """Windows API structure for symbol information.
-
- Used with dbghelp.dll SymEnumSymbolsA to enumerate symbols.
- Contains detailed information about a symbol including name, address,
- flags, and type information.
-
- Key Attributes:
- Name: Symbol name (null-terminated char array)
- Address: Absolute address in memory
- ModBase: Module base address
- Flags: Symbol flags (SymFlag enum values)
- Tag: Symbol tag type (SymTagEnum values)
- Size: Symbol size in bytes
- Value: Symbol value (for constants)
-
- Helper Methods:
- is_function(): True if symbol is a function
- is_export(): True if symbol is exported
- is_local(): True if symbol is local variable
- is_parameter(): True if symbol is function parameter
- decode_flags(): List of flag names
-
- Properties:
- name: Decoded symbol name (str)
- tag: Symbol tag name (str)
- rel_address: Relative address (Address - ModBase)
-
- Example:
- ```python
- # Used in enumeration callback
- def callback(sym_info, size, context):
- sym = ctypes.cast(sym_info, ctypes.POINTER(SYMBOL_INFO)).contents
- if sym.is_function():
- print(f"Function: {sym.name} @ {sym.Address:#x}")
- return True # Continue enumeration
- ```
- """
-
- def is_clr_token(self) -> bool:
- """Check if symbol is a CLR token (.NET managed code)."""
- return bool(self.Flags & SymFlag.SYMFLAG_CLR_TOKEN)
- return bool(self.Flags & SymFlag.SYMFLAG_CLR_TOKEN)
-
- def is_constant(self) -> bool:
- """Check if symbol is a constant value."""
- return bool(self.Flags & SymFlag.SYMFLAG_CONSTANT)
-
- def is_export(self) -> bool:
- """Check if symbol is exported from module."""
- return bool(self.Flags & SymFlag.SYMFLAG_EXPORT)
-
- def is_forwarder(self) -> bool:
- """Check if symbol is an export forwarder."""
- return bool(self.Flags & SymFlag.SYMFLAG_FORWARDER)
-
- def is_framerel(self) -> bool:
- """Check if symbol is frame-relative (stack variable)."""
- return bool(self.Flags & SymFlag.SYMFLAG_FRAMEREL)
-
- def is_function(self) -> bool:
- """Check if symbol is a function."""
- return bool(self.Flags & SymFlag.SYMFLAG_FUNCTION)
-
- def is_ilrel(self) -> bool:
- """Check if symbol is IL-relative (.NET managed code)."""
- return bool(self.Flags & SymFlag.SYMFLAG_ILREL)
-
- def is_local(self) -> bool:
- """Check if symbol is a local variable."""
- return bool(self.Flags & SymFlag.SYMFLAG_LOCAL)
-
- def is_metadata(self) -> bool:
- """Check if symbol is metadata."""
- return bool(self.Flags & SymFlag.SYMFLAG_METADATA)
-
- def is_parameter(self) -> bool:
- """Check if symbol is a function parameter."""
- return bool(self.Flags & SymFlag.SYMFLAG_PARAMETER)
-
- def is_register(self) -> bool:
- """Check if symbol is in a register."""
- return bool(self.Flags & SymFlag.SYMFLAG_REGISTER)
-
- def is_regrel(self) -> bool:
- """Check if symbol is register-relative."""
- return bool(self.Flags & SymFlag.SYMFLAG_REGREL)
-
- def is_slot(self) -> bool:
- """Check if symbol is a slot (.NET managed code)."""
- return bool(self.Flags & SymFlag.SYMFLAG_SLOT)
-
- def is_thunk(self) -> bool:
- """Check if symbol is a thunk (jump stub)."""
- return bool(self.Flags & SymFlag.SYMFLAG_THUNK)
-
- def is_tlsrel(self) -> bool:
- """Check if symbol is thread-local storage relative."""
- return bool(self.Flags & SymFlag.SYMFLAG_TLSREL)
-
- def is_value_present(self) -> bool:
- """Check if symbol has value field populated."""
- return bool(self.Flags & SymFlag.SYMFLAG_VALUEPRESENT)
-
- def is_virtual(self) -> bool:
- """Check if symbol is virtual."""
- return bool(self.Flags & SymFlag.SYMFLAG_VIRTUAL)
-
- # @cached_property
- def decode_flags(self) -> list[str]:
- """Decode Flags field to list of flag names.
-
- Returns:
- List of flag names (e.g., ["SYMFLAG_FUNCTION", "SYMFLAG_EXPORT"])
- """
- return [f.name for f in SymFlag if self.Flags & f.value]
-
- # @cached_property
- @property
- def name(self):
- """Get symbol name as decoded string.
-
- Returns:
- Symbol name (str), ignoring decode errors
- """
- return self.Name.decode(errors="ignore")
-
- # @cached_property
- @property
- def tag(self):
- """Get symbol tag name.
-
- Returns:
- Tag name (e.g., "SymTagFunction"), or "SymTagNull" if invalid
- """
- try:
- return SymTagEnum(self.Tag).name
- except ValueError:
- return SymTagEnum.SymTagNull.name
-
- # @cached_property
- @property
- def rel_address(self):
- """Get symbol address relative to module base.
-
- Returns:
- Relative virtual address (RVA)
- """
- return self.Address - (0 if self.ModBase is None else self.ModBase)
-
- def __repr__(self) -> str:
- name = self.Name.decode(errors="ignore")
- return f""
-
- _fields_ = [
- ("SizeOfStruct", ULONG),
- ("TypeIndex", ULONG),
- ("Reserved", ULONG64 * 2),
- ("Index", ULONG),
- ("Size", ULONG),
- ("ModBase", ULONG64),
- ("Flags", ULONG),
- ("Value", ULONG64),
- ("Address", ULONG64),
- ("Register", ULONG),
- ("Scope", ULONG),
- ("Tag", ULONG),
- ("NameLen", ULONG),
- ("MaxNameLen", ULONG),
- ("Name", ctypes.c_char * (MAX_SYM_NAME + 1)),
- ]
-
-
-# MODULEINFO struct
-class MODULEINFO(ctypes.Structure):
- """Windows API structure for module information (from psapi.dll).
-
- Used with GetModuleInformation to query module metadata.
- """
-
- _fields_ = [
- ("lpBaseOfDll", LPVOID),
- ("SizeOfImage", DWORD),
- ("EntryPoint", LPVOID),
- ]
-
-
-class SymTagEnum(IntEnum):
- """Symbol tag types for PDB symbols.
-
- Defines the kind of symbol (function, data, type, etc.).
- Used in SYMBOL_INFO.Tag field.
-
- Common Values:
- SymTagFunction (5): Function symbol
- SymTagData (7): Variable symbol
- SymTagPublicSymbol (10): Exported symbol
- SymTagUDT (11): User-defined type (struct/class)
- SymTagEnum (12): Enumeration type
- SymTagPointerType (14): Pointer type
- SymTagArrayType (15): Array type
- SymTagBaseType (16): Primitive type
- """
-
- SymTagNull = 0
- SymTagExe = 1
- SymTagCompiland = 2
- SymTagCompilandDetails = 3
- SymTagCompilandEnv = 4
- SymTagFunction = 5
- SymTagBlock = 6
- SymTagData = 7
- SymTagAnnotation = 8
- SymTagLabel = 9
- SymTagPublicSymbol = 10
- SymTagUDT = 11
- SymTagEnum = 12
- SymTagFunctionType = 13
- SymTagPointerType = 14
- SymTagArrayType = 15
- SymTagBaseType = 16
- SymTagTypedef = 17
- SymTagBaseClass = 18
- SymTagFriend = 19
- SymTagFunctionArgType = 20
- SymTagFuncDebugStart = 21
- SymTagFuncDebugEnd = 22
- SymTagUsingNamespace = 23
- SymTagVTableShape = 24
- SymTagVTable = 25
- SymTagCustom = 26
- SymTagThunk = 27
- SymTagCustomType = 29
- SymTagManagedType = 30
- SymTagDimension = 31
-
-
-class UdtKind(enum.IntEnum):
- UdtStruct = 0
- UdtClass = 1
- UdtUnion = 2
-
-
-class IMAGEHLP_SYMBOL_TYPE_INFO(IntEnum):
- """Constants for SymGetTypeInfo queries.
-
- Used with dbghelp.SymGetTypeInfo to query type information.
-
- Common Queries:
- TI_GET_SYMTAG (0): Get symbol tag
- TI_GET_SYMNAME (1): Get symbol name
- TI_GET_LENGTH (2): Get type size in bytes
- TI_GET_TYPE (3): Get type index
- TI_GET_BASETYPE (5): Get base type (BasicType enum)
- TI_GET_CHILDRENCOUNT (13): Get count of child members
- TI_GET_OFFSET (10): Get member offset in struct
- """
-
- TI_GET_SYMTAG = 0
- TI_GET_SYMNAME = 1
- TI_GET_LENGTH = 2
- TI_GET_TYPE = 3
- TI_GET_TYPEID = 4
- TI_GET_BASETYPE = 5
- TI_GET_ARRAYINDEXTYPEID = 6
- TI_FINDCHILDREN = 7
- TI_GET_DATAKIND = 8
- TI_GET_ADDRESSOFFSET = 9
- TI_GET_OFFSET = 10
- TI_GET_VALUE = 11
- TI_GET_COUNT = 12
- TI_GET_CHILDRENCOUNT = 13
- TI_GET_BITPOSITION = 14
- TI_GET_VIRTUALBASECLASS = 15
- TI_GET_VIRTUALTABLESHAPEID = 16
- TI_GET_VIRTUALBASEPOINTEROFFSET = 17
- TI_GET_CLASSTYPEID = 18
- TI_GET_NESTED = 19
- TI_GET_SYMINDEX = 20
- TI_GET_LEXICALPARENT = 21
- TI_GET_ADDRESS = 22
- TI_GET_THISADJUST = 23
- TI_GET_UDTKIND = 24
- TI_IS_EQUIV_TO = 25
- TI_GET_CALLING_CONVENTION = 26
- TI_IS_CLOSE_EQUIV_TO = 27
- TI_GTIEX_REQS_VALID = 28
- TI_GET_VIRTUALBASEOFFSET = 29
- TI_GET_VIRTUALBASEDISPINDEX = 30
- TI_GET_IS_REFERENCE = 31
- TI_GET_INDIRECTVIRTUALBASEDISPINDEX = 32
- TI_GET_VIRTUALBASETABLETYPEID = 33
- TI_GET_OBJECTPOINTERTYPEID = 34
- TI_GET_IS_CONST = 35
- TI_GET_IS_VOLATILE = 36
- TI_GET_IS_UNALIGNED = 37
-
-
-class BasicType(IntEnum):
- """Primitive type identifiers for PDB types.
-
- Used with TI_GET_BASETYPE query to identify base types.
-
- Common Types:
- btVoid (1): void type
- btChar (2): char type
- btInt (6): signed integer
- btUInt (7): unsigned integer
- btFloat (8): floating point
- btBool (10): boolean
- btLong (13): long integer
- btULong (14): unsigned long
- """
-
- btNoType = 0
- btVoid = 1
- btChar = 2
- btWChar = 3
- btInt = 6
- btUInt = 7
- btFloat = 8
- btBCD = 9
- btBool = 10
- btLong = 13
- btULong = 14
- btCurrency = 25
- btDate = 26
- btVariant = 27
- btComplex = 28
- btBit = 29
- btBSTR = 30
- btHresult = 31
- btChar16 = 32
- btChar32 = 33
- btChar8 = 34
-
-
-PRIMITIVE_TYPEMAP = {
- BasicType.btNoType: "",
- BasicType.btVoid: "void",
- BasicType.btChar: "char",
- BasicType.btWChar: "wchar",
- BasicType.btInt: "signed int",
- BasicType.btUInt: "unsigned int",
- BasicType.btFloat: "float",
- BasicType.btBCD: "BCD",
- BasicType.btBool: "bool",
- BasicType.btLong: "long",
- BasicType.btULong: "unsigned long",
- BasicType.btCurrency: "CURRENCY",
- BasicType.btDate: "DATE",
- BasicType.btVariant: "VARIANT",
- BasicType.btComplex: "complex",
- BasicType.btBit: "BIT",
- BasicType.btBSTR: "BSTR",
- BasicType.btHresult: "HRESULT",
- BasicType.btChar16: "Char16",
- BasicType.btChar32: "Char32",
- BasicType.btChar8: "Char8",
-}
-
-
-class DataKind(enum.IntEnum):
- DataIsUnknown = 0
- DataIsLocal = 1
- DataIsStaticLocal = 2
- DataIsParam = 3
- DataIsObjectPtr = 4
- DataIsFileStatic = 5
- DataIsGlobal = 6
- DataIsMember = 7
- DataIsStaticMember = 8
- DataIsConstant = 9
-
-
-class SymFlag(IntEnum):
- """Symbol flags for SYMBOL_INFO.Flags field.
-
- Bit flags indicating symbol properties.
-
- Common Flags:
- SYMFLAG_FUNCTION (0x800): Symbol is a function
- SYMFLAG_EXPORT (0x200): Symbol is exported
- SYMFLAG_LOCAL (0x80): Symbol is local variable
- SYMFLAG_PARAMETER (0x40): Symbol is function parameter
- SYMFLAG_REGISTER (0x8): Symbol is in register
- SYMFLAG_CONSTANT (0x100): Symbol is a constant
- SYMFLAG_VALUEPRESENT (0x1): Value field is valid
- """
-
- SYMFLAG_VALUEPRESENT = 0x00000001
- SYMFLAG_REGISTER = 0x00000008
- SYMFLAG_REGREL = 0x00000010
- SYMFLAG_FRAMEREL = 0x00000020
- SYMFLAG_PARAMETER = 0x00000040
- SYMFLAG_LOCAL = 0x00000080
- SYMFLAG_CONSTANT = 0x00000100
- SYMFLAG_EXPORT = 0x00000200
- SYMFLAG_FORWARDER = 0x00000400
- SYMFLAG_FUNCTION = 0x00000800
- SYMFLAG_VIRTUAL = 0x00001000
- SYMFLAG_THUNK = 0x00002000
- SYMFLAG_TLSREL = 0x00004000
- SYMFLAG_SLOT = 0x00008000
- SYMFLAG_ILREL = 0x00010000
- SYMFLAG_METADATA = 0x00020000
- SYMFLAG_CLR_TOKEN = 0x00040000
-
-
-# Prototypes
-if _WINDOWS:
- dbghelp.SymInitialize.argtypes = [HANDLE, LPCWSTR, BOOL]
- dbghelp.SymInitialize.restype = BOOL
-
- dbghelp.SymCleanup.argtypes = [HANDLE]
- dbghelp.SymCleanup.restype = BOOL
-
- dbghelp.SymSetOptions.argtypes = [DWORD]
- dbghelp.SymSetOptions.restype = DWORD
-
- dbghelp.SymGetOptions.argtypes = []
- dbghelp.SymGetOptions.restype = DWORD
-
- dbghelp.SymLoadModuleExW.argtypes = [HANDLE, HANDLE, LPCWSTR, LPCWSTR, ULONG64, DWORD, LPVOID, DWORD]
- dbghelp.SymLoadModuleExW.restype = ULONG64 # returns base
-
- dbghelp.SymSetSearchPath.argtypes = [HANDLE, LPCSTR]
- dbghelp.SymSetSearchPath.restype = BOOL
-
- dbghelp.SymGetSearchPath.argtypes = [HANDLE, ctypes.c_char_p, DWORD]
- dbghelp.SymGetSearchPath.restype = BOOL
-
-# SymEnumSymbolsA callback and function
-if _WINDOWS:
- PSYM_ENUMERATESYMBOLS_CALLBACK = ctypes.WINFUNCTYPE(
- BOOL,
- ctypes.POINTER(SYMBOL_INFO),
- ULONG,
- LPVOID,
- )
-
-if _WINDOWS:
- dbghelp.SymEnumSymbols.argtypes = [HANDLE, ULONG64, LPCSTR, PSYM_ENUMERATESYMBOLS_CALLBACK, LPVOID]
- dbghelp.SymEnumSymbols.restype = BOOL
-
-# SymFromAddr
-if _WINDOWS:
- dbghelp.SymFromAddr.argtypes = [HANDLE, ULONG64, ctypes.POINTER(ULONG64), ctypes.POINTER(SYMBOL_INFO)]
- dbghelp.SymFromAddr.restype = BOOL
-
- dbghelp.SymGetTypeInfo.argtypes = [HANDLE, ULONG64, ULONG, ctypes.c_int, LPVOID]
- dbghelp.SymGetTypeInfo.restype = BOOL
-
-# Kernel32 helpers
-if _WINDOWS:
- kernel32.GetCurrentProcess.restype = HANDLE
- psapi.GetModuleInformation.argtypes = [HANDLE, HANDLE, ctypes.POINTER(MODULEINFO), DWORD]
- psapi.GetModuleInformation.restype = BOOL
- kernel32.GetLastError.restype = DWORD
- kernel32.LoadLibraryA.argtypes = [LPCSTR]
- kernel32.LoadLibraryA.restype = HANDLE
- kernel32.FreeLibrary.argtypes = [HANDLE]
- kernel32.FreeLibrary.restype = BOOL
- kernel32.LocalFree.argtypes = [HLOCAL]
- kernel32.LocalFree.restype = HLOCAL
-
-
-def last_error():
- if not _WINDOWS:
- return 0
- return kernel32.GetLastError()
-
-
-# SYMOPT flags (subset)
-SYMOPT_DEFERRED_LOADS = 0x00000004
-SYMOPT_UNDNAME = 0x00000002
-SYMOPT_LOAD_LINES = 0x00000010
-
-
-def load_library(lib_path: str) -> HANDLE:
- """Loads the specified module into the address space of the calling process."""
- if not _WINDOWS:
- raise OSError("PDB support requires Windows (kernel32.dll)")
- handle = kernel32.LoadLibraryA(lib_path.encode("ascii"))
- if not handle:
- raise OSError(f"LoadLibraryA failed for {lib_path}, error={last_error()}")
- return handle
-
-
-def free_library(hmod: HANDLE) -> None:
- """Frees the loaded dynamic-link library (DLL) module."""
- if not _WINDOWS:
- raise OSError("PDB support requires Windows (kernel32.dll)")
- if not kernel32.FreeLibrary(hmod):
- raise OSError(f"FreeLibrary failed, error={last_error()}")
-
-
-class CTypeInfoDump:
- """Extracts C type information from PDB debug symbols.
-
- Recursively resolves type definitions including pointers, arrays,
- structures, unions, and base types. Uses dbghelp.SymGetTypeInfo
- to query type metadata.
-
- Attributes:
- process: dbghelp process handle
- mod_base: Module base address
-
- Type Resolution Algorithm:
- 1. Query type tag (pointer, array, UDT, base type, etc.)
- 2. For compound types:
- - Pointer: Resolve pointed-to type
- - Array: Resolve element type and count
- - UDT: Enumerate members recursively
- 3. Calculate sizes and offsets
- 4. Build type dictionary with metadata
-
- Example:
- ```python
- type_dumper = CTypeInfoDump(pdb_session.handle, base_address)
-
- # Get type info for a symbol
- type_info = type_dumper.get_type_from_type_index(type_idx)
- print(f"Type: {type_info['type_name']}")
- print(f"Size: {type_info['size']} bytes")
-
- # For struct, enumerate members
- if 'members' in type_info:
- for member in type_info['members']:
- print(f" {member['name']}: {member['type_name']} @ +{member['offset']}")
- ```
-
- Note:
- Type resolution can be slow for complex recursive structures.
- Use caching when querying multiple symbols.
- """
-
- def __init__(self, process, mod_base):
- """Initialize type info dumper.
-
- Args:
- process: dbghelp process handle from PdbSession
- mod_base: Module base address
- """
- self.process = process
- self.mod_base = mod_base
- self._type_cache: dict[int, symbols.TypeInfo] = {}
- self._resolving: set[int] = set()
-
- def get_type_info(self, type_id, info_type):
- """Query type information from dbghelp.
-
- Args:
- type_id: Type index to query
- info_type: IMAGEHLP_SYMBOL_TYPE_INFO constant
-
- Returns:
- Type information value (type depends on info_type):
- - String for TI_GET_SYMNAME
- - Integer for TI_GET_LENGTH, TI_GET_COUNT, etc.
- - Boolean for TI_GET_IS_CONST, TI_GET_IS_VOLATILE, etc.
- - None if query fails
-
- Note:
- Different info_type values return different data types.
- Memory for strings (TI_GET_SYMNAME) is automatically freed.
- """
- if not _WINDOWS:
- return None
- if info_type in (IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_SYMNAME,):
- ptr = ctypes.c_void_p()
- if dbghelp.SymGetTypeInfo(self.process, self.mod_base, type_id, info_type.value, ctypes.byref(ptr)):
- if ptr.value:
- name = ctypes.wstring_at(ptr)
- kernel32.LocalFree(ptr)
- return name
- elif info_type in (
- IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_SYMTAG,
- IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_BASETYPE,
- IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_TYPE,
- IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_TYPEID,
- IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_ARRAYINDEXTYPEID,
- IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_COUNT,
- IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_CHILDRENCOUNT,
- IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_OFFSET,
- IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_DATAKIND,
- IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_UDTKIND,
- IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_ADDRESSOFFSET,
- IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_BITPOSITION,
- IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_VIRTUALTABLESHAPEID,
- IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_VIRTUALBASEPOINTEROFFSET,
- IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_CLASSTYPEID,
- IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_NESTED,
- IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_SYMINDEX,
- IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_LEXICALPARENT,
- IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_THISADJUST,
- IMAGEHLP_SYMBOL_TYPE_INFO.TI_IS_EQUIV_TO,
- IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_CALLING_CONVENTION,
- IMAGEHLP_SYMBOL_TYPE_INFO.TI_IS_CLOSE_EQUIV_TO,
- IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_VIRTUALBASEOFFSET,
- IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_VIRTUALBASEDISPINDEX,
- IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_OBJECTPOINTERTYPEID,
- ):
- out = DWORD()
- if dbghelp.SymGetTypeInfo(self.process, self.mod_base, type_id, info_type.value, ctypes.byref(out)):
- return out.value
- elif info_type == IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_VALUE:
- out = VARIANT()
- if dbghelp.SymGetTypeInfo(self.process, self.mod_base, type_id, info_type.value, ctypes.byref(out)):
- return _variant_to_python(out)
- elif info_type in (IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_LENGTH,
- IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_ADDRESS,
- IMAGEHLP_SYMBOL_TYPE_INFO.TI_GTIEX_REQS_VALID):
- out = ULONG64()
- if dbghelp.SymGetTypeInfo(self.process, self.mod_base, type_id, info_type.value, ctypes.byref(out)):
- return out.value
- elif info_type in (
- IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_IS_CONST,
- IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_IS_VOLATILE,
- IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_IS_UNALIGNED,
- IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_IS_REFERENCE,
- IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_VIRTUALBASECLASS,
- ):
- out = BOOL()
- if dbghelp.SymGetTypeInfo(self.process, self.mod_base, type_id, info_type.value, ctypes.byref(out)):
- return bool(out.value)
- elif info_type == IMAGEHLP_SYMBOL_TYPE_INFO.TI_FINDCHILDREN:
- count = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_CHILDRENCOUNT)
- if not count:
- return []
-
- # TI_FINDCHILDREN_PARAMS is a variable-sized structure.
- # We need to allocate enough space for Count, Start, and all ChildIds.
- size = ctypes.sizeof(TI_FINDCHILDREN_PARAMS) + (count - 1) * ctypes.sizeof(ULONG)
- buf = (ctypes.c_char * size)()
- params = ctypes.cast(buf, ctypes.POINTER(TI_FINDCHILDREN_PARAMS))
- params.contents.Count = count
- params.contents.Start = 0
-
- if dbghelp.SymGetTypeInfo(self.process, self.mod_base, type_id, info_type.value, params):
- # Access ChildId as an array of length 'count'
- child_ids = ctypes.cast(params.contents.ChildId, ctypes.POINTER(ULONG * count))
- return list(child_ids.contents)
- return None
-
- def _get_referenced_type_id(self, type_id: int) -> int | None:
- child_id = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_TYPEID)
- if child_id is None:
- child_id = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_TYPE)
- return child_id
-
- def _wrap_qualifiers(self, type_id: int, tp: symbols.TypeInfo) -> symbols.TypeInfo:
- if self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_IS_CONST):
- tp = symbols.ConstantType(tp)
- if self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_IS_VOLATILE):
- tp = symbols.VolatileType(tp)
- return tp
-
- def get_data(self, type_id: int) -> symbols.DataType | None:
- tag_val = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_SYMTAG)
- if tag_val is None:
- return None
- tag = SymTagEnum(tag_val)
- if tag != SymTagEnum.SymTagData:
- return None
- name = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_SYMNAME)
- tp = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_TYPE)
- base_type = self.get_full_type_name(tp) if tp is not None else symbols.UnspecifiedType("unknown")
- data_kind_value = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_DATAKIND)
- try:
- data_kind = DataKind(data_kind_value)
- except (TypeError, ValueError):
- data_kind = DataKind.DataIsUnknown
-
- value: Any = None
- if data_kind == DataKind.DataIsConstant:
- value = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_VALUE)
- elif data_kind in (DataKind.DataIsGlobal, DataKind.DataIsStaticLocal, DataKind.DataIsFileStatic,
- DataKind.DataIsStaticMember):
- value = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_ADDRESS)
- elif data_kind in (DataKind.DataIsLocal, DataKind.DataIsParam, DataKind.DataIsObjectPtr, DataKind.DataIsMember):
- value = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_OFFSET)
- return symbols.DataType(name or "", value, base_type, data_kind)
-
- def get_enumerators(self, type_id: int) -> list[symbols.Enumerator]:
- tag_val = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_SYMTAG)
- if tag_val is None:
- return []
- tag = SymTagEnum(tag_val)
- if tag != SymTagEnum.SymTagEnum:
- return []
- chs = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_FINDCHILDREN)
- if not chs:
- return []
- enumerators: list[symbols.Enumerator] = []
- for ch in chs:
- result = self.get_data(ch)
- if result is None:
- continue
- if isinstance(result.value, int):
- enumerators.append(symbols.Enumerator(result.name, result.value))
- return enumerators
-
- def get_struct(self, type_id: int) -> symbols.StructureType:
- name = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_SYMNAME)
- byte_size = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_LENGTH)
- chs = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_FINDCHILDREN)
- members: list[symbols.StructMember] = []
- for ch in chs or []:
- result = self.get_data(ch)
- if result is None:
- continue
- offset = result.value if isinstance(result.value, int) else 0
- members.append(symbols.StructMember(result.name, result.type, offset))
- return symbols.StructureType(name or "", int(byte_size or 0), members)
-
- def get_union(self, type_id: int) -> symbols.UnionType:
- name = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_SYMNAME)
- byte_size = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_LENGTH)
- chs = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_FINDCHILDREN)
- alternatives: list[symbols.StructMember] = []
- for ch in chs or []:
- result = self.get_data(ch)
- if result is None:
- continue
- alternatives.append(symbols.StructMember(result.name, result.type, 0))
- return symbols.UnionType(name or "", int(byte_size or 0), alternatives)
-
- def get_class(self, type_id: int) -> symbols.ClassType:
- name = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_SYMNAME)
- byte_size = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_LENGTH)
- chs = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_FINDCHILDREN)
- members: list[symbols.ClassMember] = []
- for ch in chs or []:
- result = self.get_data(ch)
- if result is None:
- continue
- offset = result.value if isinstance(result.value, int) else 0
- members.append(
- symbols.ClassMember(
- result.name,
- "",
- result.type,
- offset,
- 0,
- result.datakind == DataKind.DataIsStaticMember,
- )
- )
- return symbols.ClassType(name or "", int(byte_size or 0), members)
-
- def get_args(self, type_id: int) -> symbols.TypeInfo | None:
- tag_val = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_SYMTAG)
- if tag_val is None:
- return None
- tag = SymTagEnum(tag_val)
- if tag != SymTagEnum.SymTagFunctionArgType:
- return None
- tp = self._get_referenced_type_id(type_id)
- if tp is None:
- return symbols.UnspecifiedType("unknown")
- return self.get_full_type_name(tp)
-
-
- def get_function(self, type_id: int) -> symbols.SubroutineType:
- name = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_SYMNAME)
- chs = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_FINDCHILDREN)
- tid = self._get_referenced_type_id(type_id)
- ret_type: symbols.TypeInfo = symbols.UnspecifiedType("void")
- if tid is not None:
- ret_type = self.get_full_type_name(tid)
- args: list[symbols.TypeInfo] = []
- for ch in chs or []:
- arg = self.get_args(ch)
- if arg is not None:
- args.append(arg)
- return symbols.SubroutineType(name or "", 0, ret_type, args)
-
- def get_full_type_name(self, type_id: int | None) -> symbols.TypeInfo:
- if type_id is None:
- return symbols.UnspecifiedType("unknown")
- if type_id in self._type_cache:
- return self._type_cache[type_id]
- if type_id in self._resolving:
- return symbols.UnspecifiedType(f"recursive_type_{type_id}")
-
- self._resolving.add(type_id)
- try:
- resolved = self._resolve_type(type_id)
- self._type_cache[type_id] = resolved
- return resolved
- finally:
- self._resolving.discard(type_id)
-
- def _resolve_type(self, type_id: int) -> symbols.TypeInfo:
- tag_val = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_SYMTAG)
- if tag_val is None:
- return symbols.UnspecifiedType("unknown")
-
- try:
- tag = SymTagEnum(tag_val)
- except ValueError:
- return symbols.UnspecifiedType(f"unknown_tag_{tag_val}")
-
- if tag == SymTagEnum.SymTagBaseType:
- bt = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_BASETYPE)
- length = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_LENGTH)
- if bt is not None:
- try:
- base_type = BasicType(bt)
- except ValueError:
- base_type = bt
- type_name = PRIMITIVE_TYPEMAP.get(base_type, f"base_{bt}") if isinstance(base_type, BasicType) else f"base_{bt}"
- enc = symbols.type_encoding_from_pdb_bt(int(base_type))
- resolved = symbols.PrimitiveType(type_name, enc, int(length or 0))
- return self._wrap_qualifiers(type_id, resolved)
- enc = symbols.type_encoding_from_pdb_bt(int(BasicType.btVoid))
- return self._wrap_qualifiers(type_id, symbols.PrimitiveType("void", enc, int(length or 0)))
-
- elif tag == SymTagEnum.SymTagPointerType:
- child_id = self._get_referenced_type_id(type_id)
- is_ref = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_IS_REFERENCE)
- full_type = self.get_full_type_name(child_id)
- if is_ref:
- resolved = symbols.ReferenceType(full_type)
- else:
- resolved = symbols.PointerType(full_type)
- return self._wrap_qualifiers(type_id, resolved)
-
- elif tag == SymTagEnum.SymTagArrayType:
- child_id = self._get_referenced_type_id(type_id)
- count = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_COUNT)
- full_type = self.get_full_type_name(child_id)
- byte_size = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_LENGTH)
- if not count and byte_size and isinstance(full_type, symbols.PrimitiveType) and full_type.byte_size:
- count = int(byte_size // full_type.byte_size)
- if count is not None:
- resolved = symbols.ArrayType(full_type, [(0, int(count))])
- else:
- resolved = symbols.ArrayType(full_type, [(0, 0)])
- if isinstance(full_type, symbols.ArrayType):
- full_type.array_spec.insert(0, (0, count)) # coerce array-specifiers.
- return full_type
- else:
- return self._wrap_qualifiers(type_id, resolved)
- elif tag == SymTagEnum.SymTagEnum:
- name = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_SYMNAME)
- tp = self._get_referenced_type_id(type_id)
- base_type = self.get_full_type_name(tp)
- byte_size = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_LENGTH)
- enumerators = self.get_enumerators(type_id)
- encoding = base_type.encoding if isinstance(base_type, symbols.PrimitiveType) else None
- resolved = symbols.EnumerationType(name or "", int(byte_size or 0), encoding, base_type, enumerators)
- return self._wrap_qualifiers(type_id, resolved)
- elif tag == SymTagEnum.SymTagTypedef:
- name = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_SYMNAME)
- tp = self._get_referenced_type_id(type_id)
- base_type = self.get_full_type_name(tp)
- resolved = symbols.TypeDefiniton(name or "", base_type)
- return self._wrap_qualifiers(type_id, resolved)
- elif tag == SymTagEnum.SymTagUDT:
- udt_kind = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_UDTKIND)
- if udt_kind == UdtKind.UdtStruct:
- resolved = self.get_struct(type_id)
- elif udt_kind == UdtKind.UdtUnion:
- resolved = self.get_union(type_id)
- elif udt_kind == UdtKind.UdtClass:
- resolved = self.get_class(type_id)
- else:
- name = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_SYMNAME)
- resolved = symbols.UnspecifiedType(name or "udt")
- return self._wrap_qualifiers(type_id, resolved)
- elif tag == SymTagEnum.SymTagFunctionType:
- return self._wrap_qualifiers(type_id, self.get_function(type_id))
- else:
- return symbols.UnspecifiedType(tag.name.lstrip("SymTag"))
-
-
-class PdbSession:
- """Manages dbghelp.dll symbol session lifecycle.
-
- Context manager for dbghelp symbol operations. Handles initialization,
- module loading, symbol enumeration, and cleanup.
-
- The session maintains a dbghelp process handle and configures symbol
- search paths. Automatically loads modules and enables symbol options.
-
- Attributes:
- hproc: Process handle (from GetCurrentProcess)
- _modules: Dictionary of loaded module bases by path
-
- Usage:
- ```python
- # Basic session
- with PdbSession(symbol_path=[".", "C:\\Symbols"]) as pdb:
- # Enumerate symbols
- for sym in pdb.enum_symbols():
- print(f"{sym.name}: {sym.Address:#x}")
-
- # Load specific module
- pdb = PdbSession()
- try:
- base = pdb.load_module("app.exe")
- info = pdb.get_module_info()
- print(f"Module loaded at {base:#x}, size {info.size_of_image} bytes")
- finally:
- pdb.close()
- ```
-
- Symbol Options:
- The session automatically enables:
- - SYMOPT_DEFERRED_LOADS: Load symbols on demand
- - SYMOPT_UNDNAME: Undecorate C++ symbols
- - SYMOPT_LOAD_LINES: Load source line information
-
- Note:
- Always use context manager (with statement) or manually call close()
- to ensure proper cleanup of dbghelp resources.
- """
-
- def __init__(self, symbol_path: list[str] | None = None):
- """Initialize dbghelp symbol session.
-
- Args:
- symbol_path: Optional list of directories to search for symbols.
- Supports local paths and symbol servers:
- - Local: ["C:\\Symbols", "D:\\Debug"]
- - Server: ["SRV*C:\\Cache*https://msdl.microsoft.com/download/symbols"]
- If None, uses current directory and _NT_SYMBOL_PATH
-
- Raises:
- OSError: If not on Windows or SymInitialize fails
- """
- if not _WINDOWS:
- raise OSError("PDB support requires Windows (dbghelp.dll)")
-
- self.hproc = kernel32.GetCurrentProcess()
- if symbol_path:
- symbol_path_str = ";".join(symbol_path)
- else:
- symbol_path_str = None
-
- if not dbghelp.SymInitialize(self.hproc, symbol_path_str, True):
- raise OSError(f"SymInitialize failed, error={last_error()}")
-
- opts = dbghelp.SymGetOptions()
- opts |= SYMOPT_DEFERRED_LOADS | SYMOPT_UNDNAME | SYMOPT_LOAD_LINES
- dbghelp.SymSetOptions(opts)
- self.type_dumper_cache = {}
-
- @lru_cache
- def type_info(self, base: int, type_index: int) -> symbols.TypeInfo:
- if type_index:
- if base in self.type_dumper_cache:
- type_dumper = self.type_dumper_cache[base]
- else:
- type_dumper = CTypeInfoDump(self.hproc, base)
- self.type_dumper_cache[base] = type_dumper
- return type_dumper.get_full_type_name(type_index)
- return symbols.UnspecifiedType("unknown")
-
- def cleanup(self) -> None:
- """Cleans up the dbghelp session."""
- if _WINDOWS and hasattr(self, "hproc"):
- dbghelp.SymCleanup(self.hproc)
-
- def __enter__(self):
- return self
-
- def __exit__(self, exc_type, exc_val, exc_tb):
- self.cleanup()
-
- def set_search_path(self, search_path: str) -> None:
- """Sets the symbol search path for the current session."""
- if not dbghelp.SymSetSearchPath(self.hproc, search_path.encode("ascii")):
- raise OSError(f"SymSetSearchPath failed, error={last_error()}")
-
- def get_search_path(self) -> str:
- """Gets the symbol search path for the current session."""
- buffer = ctypes.create_string_buffer(2048)
- if not dbghelp.SymGetSearchPath(self.hproc, buffer, ctypes.sizeof(buffer)):
- raise OSError(f"SymGetSearchPath failed, error={last_error()}")
- return buffer.value.decode("ascii")
-
- def load_module(self, file_path: str) -> int:
- """Loads a module for the current session."""
- file_path = str(file_path)
- base = dbghelp.SymLoadModuleExW(self.hproc, None, file_path, None, 0, 0, None, 0)
- if base == 0:
- raise OSError(f"SymLoadModuleExW failed for {file_path}, error={last_error()}")
- return base
-
- def enum_symbols(self, base: int, pattern: bytes = b"*") -> list[dict]: # Generator[]:
- """Enumerates symbols in a loaded module."""
- results: list[dict] = []
-
- def _cb(pSymInfo, size, ctx):
- sym = pSymInfo.contents
- # Extract unused but potentially useful fields for debugging
- # name, addr, tag = sym.name, sym.Address, sym.tag
- # type_name = self.type_info(base, sym.TypeIndex)
- results.append(copy(sym))
- return True
-
- cb = PSYM_ENUMERATESYMBOLS_CALLBACK(_cb)
- if not dbghelp.SymEnumSymbols(self.hproc, base, pattern, cb, None):
- raise OSError(f"SymEnumSymbols failed, error={last_error()}")
- return results
-
- def sym_from_addr(self, addr: int):
- """Retrieves symbol information for the specified address."""
- displacement = ULONG64(0)
- info = SYMBOL_INFO()
- info.SizeOfStruct = ctypes.sizeof(SYMBOL_INFO)
- info.MaxNameLen = MAX_SYM_NAME
- if not dbghelp.SymFromAddr(self.hproc, ULONG64(addr), ctypes.byref(displacement), ctypes.byref(info)):
- raise OSError(f"SymFromAddr failed, error={last_error()}")
- return info.Name.decode(errors="ignore"), int(info.Address), int(displacement.value)
-
- def get_module_information(self, hmod: HANDLE) -> ModuleInfo:
- """Gets module information for the given module handle."""
- modinfo = MODULEINFO()
- if not psapi.GetModuleInformation(self.hproc, hmod, ctypes.byref(modinfo), ctypes.sizeof(modinfo)):
- raise OSError(f"GetModuleInformation failed, error={last_error()}")
- return ModuleInfo(modinfo.lpBaseOfDll, modinfo.SizeOfImage, modinfo.EntryPoint)
-
-
-def pdb_symbols_for_pe(pe_path: str, symbol_path: str | None = None) -> list[dict]:
- """Load PDB symbols for a PE file (high-level API).
-
- Convenience function that creates a PdbSession, loads the PE module,
- enumerates all symbols, and returns a list compatible with Pe_Symbol.
-
- Args:
- pe_path: Path to PE file (.exe, .dll, etc.)
- symbol_path: Optional symbol search path string.
- Supports semicolon-separated paths and symbol servers:
- - "C:\\Symbols;D:\\Debug"
- - "SRV*C:\\Cache*https://msdl.microsoft.com/download/symbols"
- If None, searches current directory
-
- Returns:
- List of symbol dictionaries with fields:
- - name (str): Symbol name
- - value (int): Symbol address (absolute VA)
- - section_number (int): Always 0 for PDB symbols
- - type (str): Type information if available
- - storage_class (int): Always 0 for PDB symbols
-
- Example:
- ```python
- # Basic usage
- symbols = pdb_symbols_for_pe("kernel32.dll")
- for sym in symbols:
- print(f"{sym['name']:40s} @ {sym['value']:#010x}")
-
- # With symbol server
- symbols = pdb_symbols_for_pe(
- "app.exe",
- "SRV*C:\\SymCache*https://msdl.microsoft.com/download/symbols"
- )
-
- # Filter functions only
- functions = [s for s in symbols if s.get('is_function', False)]
- ```
-
- Note:
- - Returns empty list if PDB not found or on non-Windows platforms
- - Symbol addresses are absolute (not RVAs)
- - Errors are caught and logged, returning empty list
- - For more control, use PdbSession directly
-
- Integration:
- This function is called automatically by objutils.pecoff.PeParser
- when COFF symbol table is empty.
- """
- if not _WINDOWS:
- return []
-
- try:
- with PdbSession(symbol_path if not symbol_path else [symbol_path]) as session:
- mod_base = session.load_module(pe_path)
- pdb_symbols = session.enum_symbols(mod_base, b"*")
- result = []
- for sym in pdb_symbols:
- if sym.tag != "SymTagData":
- continue
- ti = session.type_info(mod_base, sym.TypeIndex)
- print(f"{sym.tag:15}", sym.Name, hex(sym.Address - mod_base), sym.Size , sym.decode_flags(), "==>", ti)
- result.append(symbols.VariableType(sym.Name, ti, sym.Address - mod_base, sym.Size))
- return result
- """
- Name: Symbol name (null-terminated char array)
- Address: Absolute address in memory
- ModBase: Module base address
- Flags: Symbol flags (SymFlag enum values)
- Tag: Symbol tag type (SymTagEnum values)
- Size: Symbol size in bytes
- Value: Symbol value (for constants)
-
- Helper Methods:
- is_function(): True if symbol is a function
- is_export(): True if symbol is exported
- is_local(): True if symbol is local variable
- is_parameter(): True if symbol is function parameter
- decode_flags(): List of flag names
- """
-
- except (OSError, RuntimeError, ValueError) as e:
- print(f"Error: {str(e)}")
- return [] # Return an empty list in case of errors.
-
-
-def main(pe_path: str): # pragma: no cover - debug helper
- items = pdb_symbols_for_pe(pe_path)
- for it in items[:50]:
- print(f"{it['name']} : {it.get('type', 'unknown')} @ 0x{it['value']:016X}")
+#!/usr/bin/env python
+
+"""PDB debug symbol integration for PE/COFF files (Windows only).
+
+This module provides access to Microsoft Program Database (PDB) debug information
+using the Windows dbghelp.dll API. It enables comprehensive symbol lookup beyond
+the typically stripped COFF symbol table in release binaries.
+
+**Platform Support**: Windows only (requires dbghelp.dll)
+
+Overview:
+ PDB files contain rich debug information:
+
+ - **Symbols**: Function names, variables, constants
+ - **Types**: Structures, unions, enums, typedefs
+ - **Source Info**: File names, line numbers
+ - **Call Frames**: Stack unwinding data
+
+ ```
+ PE File + PDB:
+ ┌──────────────┐ ┌──────────────┐
+ │ app.exe │────>│ app.pdb │
+ │ │ │ │
+ │ Code │ │ - Symbols │
+ │ Data │ │ - Types │
+ │ (stripped) │ │ - Lines │
+ └──────────────┘ └──────────────┘
+ ```
+
+Architecture:
+ **Windows dbghelp.dll**:
+ - Microsoft's debug helper library
+ - Symbol server support
+ - Handles PDB loading and parsing
+ - Provides symbol enumeration API
+
+ **Symbol Enumeration**:
+ 1. Initialize dbghelp session (SymInitialize)
+ 2. Load PE module (SymLoadModuleExW)
+ 3. Set symbol search paths
+ 4. Enumerate symbols (SymEnumSymbolsA with callback)
+ 5. Extract type information (optional)
+ 6. Cleanup (SymCleanup)
+
+ **Type Information Extraction**:
+ - Uses dbghelp type info API (SymGetTypeInfo)
+ - Recursively resolves pointers, arrays, structs
+ - Extracts sizes, offsets, field names
+
+Usage Examples:
+ **Basic Symbol Extraction**:
+ ```python
+ from objutils.pecoff.pdb import pdb_symbols_for_pe
+
+ # Load symbols from PDB
+ symbols = pdb_symbols_for_pe("app.exe")
+
+ for sym in symbols:
+ print(f"{sym['name']:40s} @ {sym['address']:#010x}")
+ ```
+
+ **With Symbol Search Path**:
+ ```python
+ # Search multiple directories for PDB
+ symbols = pdb_symbols_for_pe(
+ "app.exe",
+ symbol_path="C:\\Symbols;SRV*C:\\SymCache*https://msdl.microsoft.com/download/symbols"
+ )
+ ```
+
+ **Advanced Session Management**:
+ ```python
+ from objutils.pecoff.pdb import PdbSession
+
+    with PdbSession(symbol_path=[".", "C:\\Symbols"]) as pdb:
+        base = pdb.load_module("app.exe")  # enum_symbols() needs the module base
+        for sym in pdb.enum_symbols(base):
+ if sym.is_function():
+ print(f"Function: {sym.name} @ {sym.Address:#x}")
+
+ # Get module info
+ info = pdb.get_module_info()
+ print(f"Module base: {info.base_of_dll:#x}")
+ ```
+
+ **Type Information Extraction**:
+ ```python
+ from objutils.pecoff.pdb import CTypeInfoDump
+
+ # Extract C type definitions
+    type_dumper = CTypeInfoDump(pdb_session.hproc, base_address)
+    type_info = type_dumper.get_full_type_name(type_idx)
+    print(f"Type: {type_info}")
+ ```
+
+Key Components:
+ **Enums**:
+ - **SymTagEnum**: Symbol tag types (function, data, UDT, etc.)
+ - **BasicType**: Primitive types (int, float, void, etc.)
+ - **SymFlag**: Symbol flags (export, local, function, etc.)
+ - **IMAGEHLP_SYMBOL_TYPE_INFO**: Type info query constants
+
+ **Data Classes**:
+ - **ModuleInfo**: Module metadata (base address, size, entry point)
+ - **SYMBOL_INFO**: Symbol information structure (ctypes)
+ - **MODULEINFO**: Windows API module info structure
+
+ **Core Classes**:
+ - **CTypeInfoDump**: Type information extraction and resolution
+ - **PdbSession**: Manages dbghelp.dll lifetime and operations
+
+dbghelp.dll API:
+ The module wraps these key dbghelp functions:
+
+ - **SymInitialize**: Initialize symbol handler
+ - **SymCleanup**: Cleanup symbol handler
+ - **SymLoadModuleExW**: Load module for symbol resolution
+ - **SymEnumSymbolsA**: Enumerate symbols with callback
+ - **SymGetTypeInfo**: Query type information
+ - **SymSetSearchPath/SymGetSearchPath**: Symbol path management
+
+Symbol Search Paths:
+ dbghelp supports flexible symbol search:
+
+ - **Local paths**: "C:\\Symbols;D:\\Debug"
+ - **Symbol servers**: "SRV*C:\\Cache*https://msdl.microsoft.com/download/symbols"
+ - **Combined**: "C:\\Local;SRV*C:\\Cache*https://server"
+
+ The `_NT_SYMBOL_PATH` environment variable is respected.
+
+Limitations:
+ - **Windows only**: Requires dbghelp.dll (unavailable on Linux/Mac)
+ - **PDB required**: Release binaries typically lack embedded COFF symbols
+ - **Architecture match**: PDB must match PE architecture (x86/x64)
+ - **Version match**: PDB should match PE build (GUID/age check)
+ - **Type info**: Complex recursive structures may have limitations
+
+Error Handling:
+ On non-Windows platforms, dbghelp/kernel32/psapi are set to None:
+
+ ```python
+ from objutils.pecoff.pdb import _WINDOWS
+
+ if not _WINDOWS:
+ print("PDB support unavailable (not Windows)")
+ ```
+
+ Import errors are caught and gracefully handled in __init__.py.
+
+See Also:
+ - objutils.pecoff: Main PE parser that uses this module
+ - objutils.pecoff.defs: PE/COFF constants
+ - objutils.elf.model: Similar ORM pattern for ELF
+ - Microsoft dbghelp.dll documentation
+ - PDB format specification
+
+Example Integration:
+ ```python
+ from objutils.pecoff import PeParser
+
+ # PeParser automatically attempts PDB loading
+ pe = PeParser("kernel32.dll", pdb_path=["C:\\Symbols"])
+
+ # Symbols now include PDB data if found
+ for sym in pe.symbols:
+ print(f"{sym['name']}: {sym['value']:#x}")
+ ```
+"""
+
+import ctypes
+import enum
+from copy import copy
+from ctypes import wintypes
+from dataclasses import dataclass
+from enum import IntEnum
+from functools import lru_cache
+from typing import Any, Optional
+
+# DLLs
+try:
+ dbghelp = ctypes.WinDLL("dbghelp") # type: ignore[attr-defined]
+ kernel32 = ctypes.WinDLL("kernel32") # type: ignore[attr-defined]
+ psapi = ctypes.WinDLL("psapi")
+ _WINDOWS = True
+except OSError: # pragma: no cover - non-Windows environment
+ dbghelp = None # type: ignore[assignment]
+ kernel32 = None # type: ignore[assignment]
+ psapi = None
+ _WINDOWS = False
+
+from objutils import symbols
+
+
+@dataclass
+class ModuleInfo:
+ """Module metadata extracted from Windows process.
+
+ Attributes:
+ base_of_dll: Base address where module is loaded in memory
+ size_of_image: Size of module in memory (bytes)
+ entry_point: Address of module entry point (or None)
+
+ Example:
+ ```python
+ info = pdb_session.get_module_info()
+ print(f"Module: {info.base_of_dll:#x} - {info.base_of_dll + info.size_of_image:#x}")
+ ```
+ """
+
+ base_of_dll: int
+ size_of_image: int
+ entry_point: Optional[int]
+
+
+# Types
+HANDLE = wintypes.HANDLE
+HLOCAL = wintypes.HANDLE
+DWORD = wintypes.DWORD
+ULONG = wintypes.ULONG
+ULONG64 = ctypes.c_ulonglong
+BOOL = wintypes.BOOL
+LPVOID = wintypes.LPVOID
+LPCWSTR = wintypes.LPCWSTR
+LPCSTR = wintypes.LPCSTR
+
+
+class VARTYPE(IntEnum):
+ """OLE Automation variant type discriminator (vt field of VARIANT)."""
+
+ VT_EMPTY = 0
+ VT_NULL = 1
+ VT_I2 = 2
+ VT_I4 = 3
+ VT_R4 = 4
+ VT_R8 = 5
+ VT_BSTR = 8
+ VT_BOOL = 11
+ VT_I1 = 16
+ VT_UI1 = 17
+ VT_UI2 = 18
+ VT_UI4 = 19
+ VT_I8 = 20
+ VT_UI8 = 21
+ VT_INT = 22
+ VT_UINT = 23
+
+
+class _VARIANT_VALUE(ctypes.Union):
+ """Inner value union of a COM VARIANT (covers numeric and pointer cases)."""
+
+ _fields_ = [
+ ("llVal", ctypes.c_longlong),
+ ("lVal", ctypes.c_long),
+ ("bVal", ctypes.c_ubyte),
+ ("iVal", ctypes.c_short),
+ ("fltVal", ctypes.c_float),
+ ("dblVal", ctypes.c_double),
+ ("boolVal", ctypes.c_short),
+ ("scode", ctypes.c_long),
+ ("cVal", ctypes.c_int8),
+ ("uiVal", ctypes.c_ushort),
+ ("ulVal", ctypes.c_ulong),
+ ("ullVal", ctypes.c_ulonglong),
+ ("intVal", ctypes.c_int),
+ ("uintVal", ctypes.c_uint),
+ ("byref", ctypes.c_void_p),
+ ]
+
+
+class VARIANT(ctypes.Structure):
+ """Minimal ctypes representation of the OLE Automation VARIANT structure.
+
+ The full COM VARIANT is a discriminated union keyed on the ``vt`` field.
+ Only the scalar numeric types that can appear as PDB constant values are
+ covered here; pointer/array/record sub-types are not needed.
+
+ Total size is 16 bytes (matching the Windows ABI definition).
+ """
+
+ _fields_ = [
+ ("vt", ctypes.c_ushort),
+ ("wReserved1", ctypes.c_ushort),
+ ("wReserved2", ctypes.c_ushort),
+ ("wReserved3", ctypes.c_ushort),
+ ("_value", _VARIANT_VALUE),
+ ]
+
+
+def _variant_to_python(variant):
+ """Convert a VARIANT value to an appropriate Python primitive.
+
+ Only the numeric VARTYPE values that are relevant for PDB constant symbols
+ are handled. Unknown or unsupported types return ``None``.
+ """
+ try:
+ kind = VARTYPE(variant.vt)
+ except ValueError:
+ return None
+ v = variant._value
+ _map = {
+ VARTYPE.VT_I1: lambda: int(v.cVal),
+ VARTYPE.VT_I2: lambda: int(v.iVal),
+ VARTYPE.VT_I4: lambda: int(v.lVal),
+ VARTYPE.VT_I8: lambda: int(v.llVal),
+ VARTYPE.VT_UI1: lambda: int(v.bVal),
+ VARTYPE.VT_UI2: lambda: int(v.uiVal),
+ VARTYPE.VT_UI4: lambda: int(v.ulVal),
+ VARTYPE.VT_UI8: lambda: int(v.ullVal),
+ VARTYPE.VT_INT: lambda: int(v.intVal),
+ VARTYPE.VT_UINT: lambda: int(v.uintVal),
+ VARTYPE.VT_R4: lambda: float(v.fltVal),
+ VARTYPE.VT_R8: lambda: float(v.dblVal),
+ VARTYPE.VT_BOOL: lambda: bool(v.boolVal),
+ }
+ fn = _map.get(kind)
+ return fn() if fn is not None else None
+
+
+class TI_FINDCHILDREN_PARAMS(ctypes.Structure):
+ _fields_ = [
+ ("Count", ULONG),
+ ("Start", ULONG),
+ ("ChildId", ULONG * 1),
+ ]
+
+
+# SYMBOL_INFO struct (ANSI)
+MAX_SYM_NAME = 2000
+
+
+class SYMBOL_INFO(ctypes.Structure):
+ """Windows API structure for symbol information.
+
+ Used with dbghelp.dll SymEnumSymbolsA to enumerate symbols.
+ Contains detailed information about a symbol including name, address,
+ flags, and type information.
+
+ Key Attributes:
+ Name: Symbol name (null-terminated char array)
+ Address: Absolute address in memory
+ ModBase: Module base address
+ Flags: Symbol flags (SymFlag enum values)
+ Tag: Symbol tag type (SymTagEnum values)
+ Size: Symbol size in bytes
+ Value: Symbol value (for constants)
+
+ Helper Methods:
+ is_function(): True if symbol is a function
+ is_export(): True if symbol is exported
+ is_local(): True if symbol is local variable
+ is_parameter(): True if symbol is function parameter
+ decode_flags(): List of flag names
+
+ Properties:
+ name: Decoded symbol name (str)
+ tag: Symbol tag name (str)
+ rel_address: Relative address (Address - ModBase)
+
+ Example:
+ ```python
+ # Used in enumeration callback
+ def callback(sym_info, size, context):
+ sym = ctypes.cast(sym_info, ctypes.POINTER(SYMBOL_INFO)).contents
+ if sym.is_function():
+ print(f"Function: {sym.name} @ {sym.Address:#x}")
+ return True # Continue enumeration
+ ```
+ """
+
+    def is_clr_token(self) -> bool:
+        """Check if symbol is a CLR token (.NET managed code)."""
+        # True when the SYMFLAG_CLR_TOKEN bit is set in the Flags field.
+        return bool(self.Flags & SymFlag.SYMFLAG_CLR_TOKEN)
+
+ def is_constant(self) -> bool:
+ """Check if symbol is a constant value."""
+ return bool(self.Flags & SymFlag.SYMFLAG_CONSTANT)
+
+ def is_export(self) -> bool:
+ """Check if symbol is exported from module."""
+ return bool(self.Flags & SymFlag.SYMFLAG_EXPORT)
+
+ def is_forwarder(self) -> bool:
+ """Check if symbol is an export forwarder."""
+ return bool(self.Flags & SymFlag.SYMFLAG_FORWARDER)
+
+ def is_framerel(self) -> bool:
+ """Check if symbol is frame-relative (stack variable)."""
+ return bool(self.Flags & SymFlag.SYMFLAG_FRAMEREL)
+
+ def is_function(self) -> bool:
+ """Check if symbol is a function."""
+ return bool(self.Flags & SymFlag.SYMFLAG_FUNCTION)
+
+ def is_ilrel(self) -> bool:
+ """Check if symbol is IL-relative (.NET managed code)."""
+ return bool(self.Flags & SymFlag.SYMFLAG_ILREL)
+
+ def is_local(self) -> bool:
+ """Check if symbol is a local variable."""
+ return bool(self.Flags & SymFlag.SYMFLAG_LOCAL)
+
+ def is_metadata(self) -> bool:
+ """Check if symbol is metadata."""
+ return bool(self.Flags & SymFlag.SYMFLAG_METADATA)
+
+ def is_parameter(self) -> bool:
+ """Check if symbol is a function parameter."""
+ return bool(self.Flags & SymFlag.SYMFLAG_PARAMETER)
+
+ def is_register(self) -> bool:
+ """Check if symbol is in a register."""
+ return bool(self.Flags & SymFlag.SYMFLAG_REGISTER)
+
+ def is_regrel(self) -> bool:
+ """Check if symbol is register-relative."""
+ return bool(self.Flags & SymFlag.SYMFLAG_REGREL)
+
+ def is_slot(self) -> bool:
+ """Check if symbol is a slot (.NET managed code)."""
+ return bool(self.Flags & SymFlag.SYMFLAG_SLOT)
+
+ def is_thunk(self) -> bool:
+ """Check if symbol is a thunk (jump stub)."""
+ return bool(self.Flags & SymFlag.SYMFLAG_THUNK)
+
+ def is_tlsrel(self) -> bool:
+ """Check if symbol is thread-local storage relative."""
+ return bool(self.Flags & SymFlag.SYMFLAG_TLSREL)
+
+ def is_value_present(self) -> bool:
+ """Check if symbol has value field populated."""
+ return bool(self.Flags & SymFlag.SYMFLAG_VALUEPRESENT)
+
+ def is_virtual(self) -> bool:
+ """Check if symbol is virtual."""
+ return bool(self.Flags & SymFlag.SYMFLAG_VIRTUAL)
+
+ # @cached_property
+ def decode_flags(self) -> list[str]:
+ """Decode Flags field to list of flag names.
+
+ Returns:
+ List of flag names (e.g., ["SYMFLAG_FUNCTION", "SYMFLAG_EXPORT"])
+ """
+ return [f.name for f in SymFlag if self.Flags & f.value]
+
+ # @cached_property
+ @property
+ def name(self):
+ """Get symbol name as decoded string.
+
+ Returns:
+ Symbol name (str), ignoring decode errors
+ """
+ return self.Name.decode(errors="ignore")
+
+ # @cached_property
+ @property
+ def tag(self):
+ """Get symbol tag name.
+
+ Returns:
+ Tag name (e.g., "SymTagFunction"), or "SymTagNull" if invalid
+ """
+ try:
+ return SymTagEnum(self.Tag).name
+ except ValueError:
+ return SymTagEnum.SymTagNull.name
+
+ # @cached_property
+ @property
+ def rel_address(self):
+ """Get symbol address relative to module base.
+
+ Returns:
+ Relative virtual address (RVA)
+ """
+ return self.Address - (0 if self.ModBase is None else self.ModBase)
+
+ def __repr__(self) -> str:
+ name = self.Name.decode(errors="ignore")
+ return f""
+
+ _fields_ = [
+ ("SizeOfStruct", ULONG),
+ ("TypeIndex", ULONG),
+ ("Reserved", ULONG64 * 2),
+ ("Index", ULONG),
+ ("Size", ULONG),
+ ("ModBase", ULONG64),
+ ("Flags", ULONG),
+ ("Value", ULONG64),
+ ("Address", ULONG64),
+ ("Register", ULONG),
+ ("Scope", ULONG),
+ ("Tag", ULONG),
+ ("NameLen", ULONG),
+ ("MaxNameLen", ULONG),
+ ("Name", ctypes.c_char * (MAX_SYM_NAME + 1)),
+ ]
+
+
+# MODULEINFO struct
+class MODULEINFO(ctypes.Structure):
+ """Windows API structure for module information (from psapi.dll).
+
+ Used with GetModuleInformation to query module metadata.
+ PdbSession.get_module_information copies the three fields into a ModuleInfo.
+ """
+
+ # Field order and types define the binary layout handed to psapi; do not reorder.
+ _fields_ = [
+ ("lpBaseOfDll", LPVOID),
+ ("SizeOfImage", DWORD),
+ ("EntryPoint", LPVOID),
+ ]
+
+
+class SymTagEnum(IntEnum):
+    """Symbol tag types for PDB symbols.
+
+    Defines the kind of symbol (function, data, type, etc.).
+    Used in SYMBOL_INFO.Tag field and returned by TI_GET_SYMTAG queries.
+    The numbering follows the DIA SDK ``SymTagEnum`` enumeration, which is
+    contiguous from 0; values above SymTagDimension are not modelled here,
+    so converting such a raw tag raises ValueError.
+
+    Common Values:
+        SymTagFunction (5): Function symbol
+        SymTagData (7): Variable symbol
+        SymTagPublicSymbol (10): Exported symbol
+        SymTagUDT (11): User-defined type (struct/class)
+        SymTagEnum (12): Enumeration type
+        SymTagPointerType (14): Pointer type
+        SymTagArrayType (15): Array type
+        SymTagBaseType (16): Primitive type
+    """
+
+    SymTagNull = 0
+    SymTagExe = 1
+    SymTagCompiland = 2
+    SymTagCompilandDetails = 3
+    SymTagCompilandEnv = 4
+    SymTagFunction = 5
+    SymTagBlock = 6
+    SymTagData = 7
+    SymTagAnnotation = 8
+    SymTagLabel = 9
+    SymTagPublicSymbol = 10
+    SymTagUDT = 11
+    SymTagEnum = 12
+    SymTagFunctionType = 13
+    SymTagPointerType = 14
+    SymTagArrayType = 15
+    SymTagBaseType = 16
+    SymTagTypedef = 17
+    SymTagBaseClass = 18
+    SymTagFriend = 19
+    SymTagFunctionArgType = 20
+    SymTagFuncDebugStart = 21
+    SymTagFuncDebugEnd = 22
+    SymTagUsingNamespace = 23
+    SymTagVTableShape = 24
+    SymTagVTable = 25
+    SymTagCustom = 26
+    SymTagThunk = 27
+    # The three members below were previously numbered 29..31, leaving 28
+    # undefined and shifting every later tag by one.
+    SymTagCustomType = 28
+    SymTagManagedType = 29
+    SymTagDimension = 30
+
+
+class UdtKind(enum.IntEnum):
+ """Kinds of user-defined types.
+
+ Compared against the TI_GET_UDTKIND query result in CTypeInfoDump to
+ choose between struct, class and union handling; any other value is
+ resolved as an unspecified type there.
+ """
+ UdtStruct = 0
+ UdtClass = 1
+ UdtUnion = 2
+
+
+class IMAGEHLP_SYMBOL_TYPE_INFO(IntEnum):
+ """Constants for SymGetTypeInfo queries.
+
+ Used with dbghelp.SymGetTypeInfo to query type information.
+ The member's integer value is what CTypeInfoDump.get_type_info passes
+ as the query selector.
+
+ Common Queries:
+ TI_GET_SYMTAG (0): Get symbol tag
+ TI_GET_SYMNAME (1): Get symbol name
+ TI_GET_LENGTH (2): Get type size in bytes
+ TI_GET_TYPE (3): Get type index
+ TI_GET_BASETYPE (5): Get base type (BasicType enum)
+ TI_GET_CHILDRENCOUNT (13): Get count of child members
+ TI_GET_OFFSET (10): Get member offset in struct
+ """
+
+ TI_GET_SYMTAG = 0
+ TI_GET_SYMNAME = 1
+ TI_GET_LENGTH = 2
+ TI_GET_TYPE = 3
+ TI_GET_TYPEID = 4
+ TI_GET_BASETYPE = 5
+ TI_GET_ARRAYINDEXTYPEID = 6
+ TI_FINDCHILDREN = 7
+ TI_GET_DATAKIND = 8
+ TI_GET_ADDRESSOFFSET = 9
+ TI_GET_OFFSET = 10
+ TI_GET_VALUE = 11
+ TI_GET_COUNT = 12
+ TI_GET_CHILDRENCOUNT = 13
+ TI_GET_BITPOSITION = 14
+ TI_GET_VIRTUALBASECLASS = 15
+ TI_GET_VIRTUALTABLESHAPEID = 16
+ TI_GET_VIRTUALBASEPOINTEROFFSET = 17
+ TI_GET_CLASSTYPEID = 18
+ TI_GET_NESTED = 19
+ TI_GET_SYMINDEX = 20
+ TI_GET_LEXICALPARENT = 21
+ TI_GET_ADDRESS = 22
+ TI_GET_THISADJUST = 23
+ TI_GET_UDTKIND = 24
+ TI_IS_EQUIV_TO = 25
+ TI_GET_CALLING_CONVENTION = 26
+ TI_IS_CLOSE_EQUIV_TO = 27
+ TI_GTIEX_REQS_VALID = 28
+ TI_GET_VIRTUALBASEOFFSET = 29
+ TI_GET_VIRTUALBASEDISPINDEX = 30
+ TI_GET_IS_REFERENCE = 31
+ TI_GET_INDIRECTVIRTUALBASEDISPINDEX = 32
+ TI_GET_VIRTUALBASETABLETYPEID = 33
+ TI_GET_OBJECTPOINTERTYPEID = 34
+ # NOTE(review): the qualifier queries below drive const/volatile wrapping in
+ # CTypeInfoDump._wrap_qualifiers - confirm that the installed dbghelp.dll
+ # really understands selectors 35..37; a failing query is treated as "not set".
+ TI_GET_IS_CONST = 35
+ TI_GET_IS_VOLATILE = 36
+ TI_GET_IS_UNALIGNED = 37
+
+
+class BasicType(IntEnum):
+ """Primitive type identifiers for PDB types.
+
+ Used with TI_GET_BASETYPE query to identify base types.
+ The numbering is sparse (e.g. nothing between 3 and 6, or 14 and 25), so
+ converting an arbitrary integer may raise ValueError.
+
+ Common Types:
+ btVoid (1): void type
+ btChar (2): char type
+ btInt (6): signed integer
+ btUInt (7): unsigned integer
+ btFloat (8): floating point
+ btBool (10): boolean
+ btLong (13): long integer
+ btULong (14): unsigned long
+ """
+
+ btNoType = 0
+ btVoid = 1
+ btChar = 2
+ btWChar = 3
+ btInt = 6
+ btUInt = 7
+ btFloat = 8
+ btBCD = 9
+ btBool = 10
+ btLong = 13
+ btULong = 14
+ btCurrency = 25
+ btDate = 26
+ btVariant = 27
+ btComplex = 28
+ btBit = 29
+ btBSTR = 30
+ btHresult = 31
+ btChar16 = 32
+ btChar32 = 33
+ btChar8 = 34
+
+
+# Display name for every BasicType member. CTypeInfoDump._resolve_type looks names up
+# here and falls back to "base_<n>" for values that have no entry.
+PRIMITIVE_TYPEMAP = {
+ BasicType.btNoType: "",
+ BasicType.btVoid: "void",
+ BasicType.btChar: "char",
+ BasicType.btWChar: "wchar",
+ BasicType.btInt: "signed int",
+ BasicType.btUInt: "unsigned int",
+ BasicType.btFloat: "float",
+ BasicType.btBCD: "BCD",
+ BasicType.btBool: "bool",
+ BasicType.btLong: "long",
+ BasicType.btULong: "unsigned long",
+ BasicType.btCurrency: "CURRENCY",
+ BasicType.btDate: "DATE",
+ BasicType.btVariant: "VARIANT",
+ BasicType.btComplex: "complex",
+ BasicType.btBit: "BIT",
+ BasicType.btBSTR: "BSTR",
+ BasicType.btHresult: "HRESULT",
+ BasicType.btChar16: "Char16",
+ BasicType.btChar32: "Char32",
+ BasicType.btChar8: "Char8",
+}
+
+
+class DataKind(enum.IntEnum):
+ """Storage kinds reported by the TI_GET_DATAKIND query.
+
+ CTypeInfoDump.get_data uses the kind to decide what to store as the data
+ item's value: the constant value, an address (global/static kinds) or an
+ offset (local/parameter/object-pointer/member kinds).
+ """
+ DataIsUnknown = 0
+ DataIsLocal = 1
+ DataIsStaticLocal = 2
+ DataIsParam = 3
+ DataIsObjectPtr = 4
+ DataIsFileStatic = 5
+ DataIsGlobal = 6
+ DataIsMember = 7
+ DataIsStaticMember = 8
+ DataIsConstant = 9
+
+
+class SymFlag(IntEnum):
+ """Symbol flags for SYMBOL_INFO.Flags field.
+
+ Bit flags indicating symbol properties.
+ Every member is a single-bit mask; SYMBOL_INFO tests them with ``&`` and
+ decode_flags() iterates all members, so several may be set at once.
+
+ Common Flags:
+ SYMFLAG_FUNCTION (0x800): Symbol is a function
+ SYMFLAG_EXPORT (0x200): Symbol is exported
+ SYMFLAG_LOCAL (0x80): Symbol is local variable
+ SYMFLAG_PARAMETER (0x40): Symbol is function parameter
+ SYMFLAG_REGISTER (0x8): Symbol is in register
+ SYMFLAG_CONSTANT (0x100): Symbol is a constant
+ SYMFLAG_VALUEPRESENT (0x1): Value field is valid
+ """
+
+ SYMFLAG_VALUEPRESENT = 0x00000001
+ SYMFLAG_REGISTER = 0x00000008
+ SYMFLAG_REGREL = 0x00000010
+ SYMFLAG_FRAMEREL = 0x00000020
+ SYMFLAG_PARAMETER = 0x00000040
+ SYMFLAG_LOCAL = 0x00000080
+ SYMFLAG_CONSTANT = 0x00000100
+ SYMFLAG_EXPORT = 0x00000200
+ SYMFLAG_FORWARDER = 0x00000400
+ SYMFLAG_FUNCTION = 0x00000800
+ SYMFLAG_VIRTUAL = 0x00001000
+ SYMFLAG_THUNK = 0x00002000
+ SYMFLAG_TLSREL = 0x00004000
+ SYMFLAG_SLOT = 0x00008000
+ SYMFLAG_ILREL = 0x00010000
+ SYMFLAG_METADATA = 0x00020000
+ SYMFLAG_CLR_TOKEN = 0x00040000
+
+
+# Prototypes
+# Everything below only runs on Windows, where dbghelp/kernel32/psapi are loaded.
+# Explicit argtypes/restype keep handles and 64-bit addresses from being passed as C int.
+if _WINDOWS:
+ # NOTE(review): the un-suffixed dbghelp entry points are normally the ANSI variants, yet
+ # the search-path argument is declared LPCWSTR here - confirm whether SymInitializeW
+ # was intended (SymSetSearchPath/SymGetSearchPath below use byte strings).
+ dbghelp.SymInitialize.argtypes = [HANDLE, LPCWSTR, BOOL]
+ dbghelp.SymInitialize.restype = BOOL
+
+ dbghelp.SymCleanup.argtypes = [HANDLE]
+ dbghelp.SymCleanup.restype = BOOL
+
+ dbghelp.SymSetOptions.argtypes = [DWORD]
+ dbghelp.SymSetOptions.restype = DWORD
+
+ dbghelp.SymGetOptions.argtypes = []
+ dbghelp.SymGetOptions.restype = DWORD
+
+ dbghelp.SymLoadModuleExW.argtypes = [HANDLE, HANDLE, LPCWSTR, LPCWSTR, ULONG64, DWORD, LPVOID, DWORD]
+ dbghelp.SymLoadModuleExW.restype = ULONG64 # returns base
+
+ dbghelp.SymSetSearchPath.argtypes = [HANDLE, LPCSTR]
+ dbghelp.SymSetSearchPath.restype = BOOL
+
+ dbghelp.SymGetSearchPath.argtypes = [HANDLE, ctypes.c_char_p, DWORD]
+ dbghelp.SymGetSearchPath.restype = BOOL
+
+# SymEnumSymbolsA callback and function
+# The callback receives (pointer to SYMBOL_INFO, symbol size, user context) and returns BOOL.
+if _WINDOWS:
+ PSYM_ENUMERATESYMBOLS_CALLBACK = ctypes.WINFUNCTYPE(
+ BOOL,
+ ctypes.POINTER(SYMBOL_INFO),
+ ULONG,
+ LPVOID,
+ )
+
+if _WINDOWS:
+ dbghelp.SymEnumSymbols.argtypes = [HANDLE, ULONG64, LPCSTR, PSYM_ENUMERATESYMBOLS_CALLBACK, LPVOID]
+ dbghelp.SymEnumSymbols.restype = BOOL
+
+# SymFromAddr
+if _WINDOWS:
+ dbghelp.SymFromAddr.argtypes = [HANDLE, ULONG64, ctypes.POINTER(ULONG64), ctypes.POINTER(SYMBOL_INFO)]
+ dbghelp.SymFromAddr.restype = BOOL
+
+ # The last argument is an untyped output buffer; its real type depends on the query.
+ dbghelp.SymGetTypeInfo.argtypes = [HANDLE, ULONG64, ULONG, ctypes.c_int, LPVOID]
+ dbghelp.SymGetTypeInfo.restype = BOOL
+
+# Kernel32 helpers
+if _WINDOWS:
+ kernel32.GetCurrentProcess.restype = HANDLE
+ psapi.GetModuleInformation.argtypes = [HANDLE, HANDLE, ctypes.POINTER(MODULEINFO), DWORD]
+ psapi.GetModuleInformation.restype = BOOL
+ kernel32.GetLastError.restype = DWORD
+ kernel32.LoadLibraryA.argtypes = [LPCSTR]
+ kernel32.LoadLibraryA.restype = HANDLE
+ kernel32.FreeLibrary.argtypes = [HANDLE]
+ kernel32.FreeLibrary.restype = BOOL
+ # Used to release the name buffer returned for TI_GET_SYMNAME queries.
+ kernel32.LocalFree.argtypes = [HLOCAL]
+ kernel32.LocalFree.restype = HLOCAL
+
+
+def last_error():
+    """Return the calling thread's Win32 last-error code, or 0 off Windows."""
+    return kernel32.GetLastError() if _WINDOWS else 0
+
+
+# SYMOPT flags (subset)
+# PdbSession ORs all three into the options returned by SymGetOptions.
+# Load symbols on demand.
+SYMOPT_DEFERRED_LOADS = 0x00000004
+# Undecorate C++ symbol names.
+SYMOPT_UNDNAME = 0x00000002
+# Load source line information.
+SYMOPT_LOAD_LINES = 0x00000010
+
+
+def load_library(lib_path: str) -> HANDLE:
+    """Map a module into the calling process via LoadLibraryA.
+
+    Args:
+        lib_path: Path of the module to load; it is encoded as ASCII.
+
+    Returns:
+        The module handle returned by LoadLibraryA.
+
+    Raises:
+        OSError: If not running on Windows or LoadLibraryA returns a null handle.
+    """
+    if _WINDOWS:
+        module = kernel32.LoadLibraryA(lib_path.encode("ascii"))
+        if module:
+            return module
+        raise OSError(f"LoadLibraryA failed for {lib_path}, error={last_error()}")
+    raise OSError("PDB support requires Windows (kernel32.dll)")
+
+
+def free_library(hmod: HANDLE) -> None:
+    """Release a module handle previously obtained from load_library.
+
+    Args:
+        hmod: Module handle to pass to FreeLibrary.
+
+    Raises:
+        OSError: If not running on Windows or FreeLibrary reports failure.
+    """
+    if not _WINDOWS:
+        raise OSError("PDB support requires Windows (kernel32.dll)")
+    released = kernel32.FreeLibrary(hmod)
+    if not released:
+        raise OSError(f"FreeLibrary failed, error={last_error()}")
+
+
+class CTypeInfoDump:
+    """Extracts C type information from PDB debug symbols.
+
+    Recursively resolves type indices (base types, pointers, references,
+    arrays, enums, typedefs, structs/unions/classes and function types) into
+    ``symbols.TypeInfo`` objects by querying ``dbghelp.SymGetTypeInfo``.
+
+    Attributes:
+        process: dbghelp process handle
+        mod_base: Module base address
+
+    Type Resolution Algorithm:
+        1. Query the symbol tag of the type index.
+        2. Dispatch on the tag:
+           - Pointer: Resolve pointed-to type (pointer or reference)
+           - Array: Resolve element type and count
+           - UDT: Enumerate data members
+           - Enum / typedef / function type: resolve the referenced type
+        3. Wrap the result in const/volatile qualifiers where reported.
+        4. Cache the result per type index; a type index that is reached
+           again while it is still being resolved yields a
+           ``recursive_type_<id>`` placeholder instead of recursing forever.
+
+    Example:
+        ```python
+        type_dumper = CTypeInfoDump(pdb_session.hproc, base_address)
+
+        # Resolve the type of a symbol
+        type_info = type_dumper.get_full_type_name(sym.TypeIndex)
+        print(type_info)
+        ```
+
+    Note:
+        Resolved types are cached per instance, so reuse one instance per
+        module when querying many symbols.
+    """
+
+    # SymGetTypeInfo queries grouped by the ctypes buffer their result is read
+    # into. Built once at class creation instead of on every get_type_info call.
+    _DWORD_QUERIES = frozenset(
+        {
+            IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_SYMTAG,
+            IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_BASETYPE,
+            IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_TYPE,
+            IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_TYPEID,
+            IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_ARRAYINDEXTYPEID,
+            IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_COUNT,
+            IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_CHILDRENCOUNT,
+            IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_OFFSET,
+            IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_DATAKIND,
+            IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_UDTKIND,
+            IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_ADDRESSOFFSET,
+            IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_BITPOSITION,
+            IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_VIRTUALTABLESHAPEID,
+            IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_VIRTUALBASEPOINTEROFFSET,
+            IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_CLASSTYPEID,
+            IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_NESTED,
+            IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_SYMINDEX,
+            IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_LEXICALPARENT,
+            IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_THISADJUST,
+            IMAGEHLP_SYMBOL_TYPE_INFO.TI_IS_EQUIV_TO,
+            IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_CALLING_CONVENTION,
+            IMAGEHLP_SYMBOL_TYPE_INFO.TI_IS_CLOSE_EQUIV_TO,
+            IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_VIRTUALBASEOFFSET,
+            IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_VIRTUALBASEDISPINDEX,
+            IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_OBJECTPOINTERTYPEID,
+        }
+    )
+    _ULONG64_QUERIES = frozenset(
+        {
+            IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_LENGTH,
+            IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_ADDRESS,
+            IMAGEHLP_SYMBOL_TYPE_INFO.TI_GTIEX_REQS_VALID,
+        }
+    )
+    _BOOL_QUERIES = frozenset(
+        {
+            IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_IS_CONST,
+            IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_IS_VOLATILE,
+            IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_IS_UNALIGNED,
+            IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_IS_REFERENCE,
+            IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_VIRTUALBASECLASS,
+        }
+    )
+
+    def __init__(self, process, mod_base):
+        """Initialize type info dumper.
+
+        Args:
+            process: dbghelp process handle from PdbSession
+            mod_base: Module base address
+        """
+        self.process = process
+        self.mod_base = mod_base
+        # Fully resolved types, keyed by type index.
+        self._type_cache: dict[int, symbols.TypeInfo] = {}
+        # Type indices currently on the resolution stack (cycle guard).
+        self._resolving: set[int] = set()
+
+    def get_type_info(self, type_id, info_type):
+        """Query type information from dbghelp.
+
+        Args:
+            type_id: Type index to query
+            info_type: IMAGEHLP_SYMBOL_TYPE_INFO constant
+
+        Returns:
+            Type information value (type depends on info_type):
+            - String for TI_GET_SYMNAME
+            - Integer for TI_GET_LENGTH, TI_GET_COUNT, etc.
+            - Boolean for TI_GET_IS_CONST, TI_GET_IS_VOLATILE, etc.
+            - Converted VARIANT value for TI_GET_VALUE
+            - List of child type indices for TI_FINDCHILDREN
+              (empty when the type has no children)
+            - None if the query fails, the query kind is not handled,
+              or when not running on Windows
+
+        Note:
+            Memory for strings (TI_GET_SYMNAME) is freed before returning.
+        """
+        if not _WINDOWS:
+            return None
+
+        def query(out) -> bool:
+            """Run the query, letting dbghelp write its result into ``out``."""
+            return bool(dbghelp.SymGetTypeInfo(self.process, self.mod_base, type_id, info_type.value, ctypes.byref(out)))
+
+        if info_type == IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_SYMNAME:
+            ptr = ctypes.c_void_p()
+            if query(ptr) and ptr.value:
+                try:
+                    return ctypes.wstring_at(ptr)
+                finally:
+                    # The buffer is owned by the caller; release it even if decoding fails.
+                    kernel32.LocalFree(ptr)
+            return None
+        if info_type in self._DWORD_QUERIES:
+            out = DWORD()
+            return out.value if query(out) else None
+        if info_type == IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_VALUE:
+            out = VARIANT()
+            return _variant_to_python(out) if query(out) else None
+        if info_type in self._ULONG64_QUERIES:
+            out = ULONG64()
+            return out.value if query(out) else None
+        if info_type in self._BOOL_QUERIES:
+            out = BOOL()
+            return bool(out.value) if query(out) else None
+        if info_type == IMAGEHLP_SYMBOL_TYPE_INFO.TI_FINDCHILDREN:
+            return self._find_children(type_id)
+        return None
+
+    def _find_children(self, type_id):
+        """Return the child type indices of ``type_id`` ([] if none, None on failure)."""
+        count = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_CHILDRENCOUNT)
+        if not count:
+            return []
+
+        # TI_FINDCHILDREN_PARAMS is a variable-sized structure.
+        # We need to allocate enough space for Count, Start, and all ChildIds.
+        size = ctypes.sizeof(TI_FINDCHILDREN_PARAMS) + (count - 1) * ctypes.sizeof(ULONG)
+        buf = (ctypes.c_char * size)()
+        params = ctypes.cast(buf, ctypes.POINTER(TI_FINDCHILDREN_PARAMS))
+        params.contents.Count = count
+        params.contents.Start = 0
+
+        selector = IMAGEHLP_SYMBOL_TYPE_INFO.TI_FINDCHILDREN.value
+        if dbghelp.SymGetTypeInfo(self.process, self.mod_base, type_id, selector, params):
+            # Access ChildId as an array of length 'count'
+            child_ids = ctypes.cast(params.contents.ChildId, ctypes.POINTER(ULONG * count))
+            return list(child_ids.contents)
+        return None
+
+    def _get_referenced_type_id(self, type_id: int) -> int | None:
+        """Return the type index referenced by ``type_id`` (TI_GET_TYPEID, then TI_GET_TYPE)."""
+        child_id = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_TYPEID)
+        if child_id is None:
+            child_id = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_TYPE)
+        return child_id
+
+    def _wrap_qualifiers(self, type_id: int, tp: symbols.TypeInfo) -> symbols.TypeInfo:
+        """Wrap ``tp`` in ConstantType and/or VolatileType as reported for ``type_id``."""
+        if self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_IS_CONST):
+            tp = symbols.ConstantType(tp)
+        if self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_IS_VOLATILE):
+            tp = symbols.VolatileType(tp)
+        return tp
+
+    def _has_tag(self, type_id: int, expected: SymTagEnum) -> bool:
+        """Return True when ``type_id`` carries exactly the tag ``expected``.
+
+        Compares the raw integer so that a failed query (None) or a tag value
+        unknown to SymTagEnum simply yields False instead of raising.
+        """
+        return self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_SYMTAG) == expected
+
+    def _udt_parts(self, type_id: int):
+        """Return (name, byte size, child ids) shared by struct, union and class resolution."""
+        name = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_SYMNAME)
+        byte_size = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_LENGTH)
+        children = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_FINDCHILDREN)
+        return name or "", int(byte_size or 0), children or []
+
+    def get_data(self, type_id: int) -> symbols.DataType | None:
+        """Describe a SymTagData symbol (member, enumerator, variable).
+
+        Returns:
+            A symbols.DataType whose value is the constant value, the address
+            or the offset depending on the data kind, or None when ``type_id``
+            is not a data symbol.
+        """
+        if not self._has_tag(type_id, SymTagEnum.SymTagData):
+            return None
+        name = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_SYMNAME)
+        tp = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_TYPE)
+        base_type = self.get_full_type_name(tp) if tp is not None else symbols.UnspecifiedType("unknown")
+        data_kind_value = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_DATAKIND)
+        try:
+            data_kind = DataKind(data_kind_value)
+        except (TypeError, ValueError):
+            data_kind = DataKind.DataIsUnknown
+
+        value: Any = None
+        if data_kind == DataKind.DataIsConstant:
+            value = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_VALUE)
+        elif data_kind in (
+            DataKind.DataIsGlobal,
+            DataKind.DataIsStaticLocal,
+            DataKind.DataIsFileStatic,
+            DataKind.DataIsStaticMember,
+        ):
+            value = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_ADDRESS)
+        elif data_kind in (DataKind.DataIsLocal, DataKind.DataIsParam, DataKind.DataIsObjectPtr, DataKind.DataIsMember):
+            value = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_OFFSET)
+        return symbols.DataType(name or "", value, base_type, data_kind)
+
+    def get_enumerators(self, type_id: int) -> list[symbols.Enumerator]:
+        """Return the integer-valued enumerators of an enum type ([] for anything else)."""
+        if not self._has_tag(type_id, SymTagEnum.SymTagEnum):
+            return []
+        children = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_FINDCHILDREN)
+        enumerators: list[symbols.Enumerator] = []
+        for ch in children or []:
+            result = self.get_data(ch)
+            if result is not None and isinstance(result.value, int):
+                enumerators.append(symbols.Enumerator(result.name, result.value))
+        return enumerators
+
+    def get_struct(self, type_id: int) -> symbols.StructureType:
+        """Build a StructureType from the data children of ``type_id``."""
+        name, byte_size, children = self._udt_parts(type_id)
+        members: list[symbols.StructMember] = []
+        for ch in children:
+            result = self.get_data(ch)
+            if result is None:
+                continue
+            offset = result.value if isinstance(result.value, int) else 0
+            members.append(symbols.StructMember(result.name, result.type, offset))
+        return symbols.StructureType(name, byte_size, members)
+
+    def get_union(self, type_id: int) -> symbols.UnionType:
+        """Build a UnionType; every alternative is recorded at offset 0."""
+        name, byte_size, children = self._udt_parts(type_id)
+        alternatives: list[symbols.StructMember] = []
+        for ch in children:
+            result = self.get_data(ch)
+            if result is None:
+                continue
+            alternatives.append(symbols.StructMember(result.name, result.type, 0))
+        return symbols.UnionType(name, byte_size, alternatives)
+
+    def get_class(self, type_id: int) -> symbols.ClassType:
+        """Build a ClassType from the data children of ``type_id``."""
+        name, byte_size, children = self._udt_parts(type_id)
+        members: list[symbols.ClassMember] = []
+        for ch in children:
+            result = self.get_data(ch)
+            if result is None:
+                continue
+            offset = result.value if isinstance(result.value, int) else 0
+            members.append(
+                symbols.ClassMember(
+                    result.name,
+                    "",
+                    result.type,
+                    offset,
+                    0,
+                    result.datakind == DataKind.DataIsStaticMember,
+                )
+            )
+        return symbols.ClassType(name, byte_size, members)
+
+    def get_args(self, type_id: int) -> symbols.TypeInfo | None:
+        """Resolve the type of a function argument (None if ``type_id`` is not an argument)."""
+        if not self._has_tag(type_id, SymTagEnum.SymTagFunctionArgType):
+            return None
+        tp = self._get_referenced_type_id(type_id)
+        if tp is None:
+            return symbols.UnspecifiedType("unknown")
+        return self.get_full_type_name(tp)
+
+    def get_function(self, type_id: int) -> symbols.SubroutineType:
+        """Build a SubroutineType with return type and argument types of ``type_id``."""
+        name = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_SYMNAME)
+        chs = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_FINDCHILDREN)
+        tid = self._get_referenced_type_id(type_id)
+        ret_type: symbols.TypeInfo = symbols.UnspecifiedType("void")
+        if tid is not None:
+            ret_type = self.get_full_type_name(tid)
+        args: list[symbols.TypeInfo] = []
+        for ch in chs or []:
+            arg = self.get_args(ch)
+            if arg is not None:
+                args.append(arg)
+        return symbols.SubroutineType(name or "", 0, ret_type, args)
+
+    def get_full_type_name(self, type_id: int | None) -> symbols.TypeInfo:
+        """Resolve ``type_id`` to a TypeInfo, using the cache and the cycle guard."""
+        if type_id is None:
+            return symbols.UnspecifiedType("unknown")
+        if type_id in self._type_cache:
+            return self._type_cache[type_id]
+        if type_id in self._resolving:
+            return symbols.UnspecifiedType(f"recursive_type_{type_id}")
+
+        self._resolving.add(type_id)
+        try:
+            resolved = self._resolve_type(type_id)
+            self._type_cache[type_id] = resolved
+            return resolved
+        finally:
+            self._resolving.discard(type_id)
+
+    def _resolve_type(self, type_id: int) -> symbols.TypeInfo:
+        """Resolve one type index without consulting the cache (dispatch on its tag)."""
+        tag_val = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_SYMTAG)
+        if tag_val is None:
+            return symbols.UnspecifiedType("unknown")
+
+        try:
+            tag = SymTagEnum(tag_val)
+        except ValueError:
+            return symbols.UnspecifiedType(f"unknown_tag_{tag_val}")
+
+        if tag == SymTagEnum.SymTagBaseType:
+            bt = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_BASETYPE)
+            length = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_LENGTH)
+            if bt is not None:
+                try:
+                    base_type = BasicType(bt)
+                except ValueError:
+                    base_type = bt
+                type_name = PRIMITIVE_TYPEMAP.get(base_type, f"base_{bt}") if isinstance(base_type, BasicType) else f"base_{bt}"
+                enc = symbols.type_encoding_from_pdb_bt(int(base_type))
+                resolved = symbols.PrimitiveType(type_name, enc, int(length or 0))
+                return self._wrap_qualifiers(type_id, resolved)
+            enc = symbols.type_encoding_from_pdb_bt(int(BasicType.btVoid))
+            return self._wrap_qualifiers(type_id, symbols.PrimitiveType("void", enc, int(length or 0)))
+
+        elif tag == SymTagEnum.SymTagPointerType:
+            child_id = self._get_referenced_type_id(type_id)
+            is_ref = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_IS_REFERENCE)
+            full_type = self.get_full_type_name(child_id)
+            if is_ref:
+                resolved = symbols.ReferenceType(full_type)
+            else:
+                resolved = symbols.PointerType(full_type)
+            return self._wrap_qualifiers(type_id, resolved)
+
+        elif tag == SymTagEnum.SymTagArrayType:
+            child_id = self._get_referenced_type_id(type_id)
+            count = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_COUNT)
+            full_type = self.get_full_type_name(child_id)
+            byte_size = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_LENGTH)
+            if not count and byte_size and isinstance(full_type, symbols.PrimitiveType) and full_type.byte_size:
+                # No element count reported: derive it from total size / element size.
+                count = int(byte_size // full_type.byte_size)
+            # A failed count query is recorded as 0 in both branches below
+            # (previously the nested branch could store None as the bound).
+            upper = int(count) if count is not None else 0
+            if isinstance(full_type, symbols.ArrayType):
+                # Multi-dimensional array: fold this dimension into the element's spec.
+                # NOTE(review): this mutates the element ArrayType, which is also held
+                # in the type cache - confirm that sharing it is intended.
+                full_type.array_spec.insert(0, (0, upper))  # coerce array-specifiers.
+                return full_type
+            return self._wrap_qualifiers(type_id, symbols.ArrayType(full_type, [(0, upper)]))
+        elif tag == SymTagEnum.SymTagEnum:
+            name = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_SYMNAME)
+            tp = self._get_referenced_type_id(type_id)
+            base_type = self.get_full_type_name(tp)
+            byte_size = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_LENGTH)
+            enumerators = self.get_enumerators(type_id)
+            encoding = base_type.encoding if isinstance(base_type, symbols.PrimitiveType) else None
+            resolved = symbols.EnumerationType(name or "", int(byte_size or 0), encoding, base_type, enumerators)
+            return self._wrap_qualifiers(type_id, resolved)
+        elif tag == SymTagEnum.SymTagTypedef:
+            name = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_SYMNAME)
+            tp = self._get_referenced_type_id(type_id)
+            base_type = self.get_full_type_name(tp)
+            resolved = symbols.TypeDefiniton(name or "", base_type)
+            return self._wrap_qualifiers(type_id, resolved)
+        elif tag == SymTagEnum.SymTagUDT:
+            udt_kind = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_UDTKIND)
+            if udt_kind == UdtKind.UdtStruct:
+                resolved = self.get_struct(type_id)
+            elif udt_kind == UdtKind.UdtUnion:
+                resolved = self.get_union(type_id)
+            elif udt_kind == UdtKind.UdtClass:
+                resolved = self.get_class(type_id)
+            else:
+                name = self.get_type_info(type_id, IMAGEHLP_SYMBOL_TYPE_INFO.TI_GET_SYMNAME)
+                resolved = symbols.UnspecifiedType(name or "udt")
+            return self._wrap_qualifiers(type_id, resolved)
+        elif tag == SymTagEnum.SymTagFunctionType:
+            return self._wrap_qualifiers(type_id, self.get_function(type_id))
+        else:
+            # removeprefix, not lstrip: lstrip("SymTag") strips any leading run of the
+            # characters S/y/m/T/a/g and would turn "SymTagThunk" into "hunk".
+            return symbols.UnspecifiedType(tag.name.removeprefix("SymTag"))
+
+
+class PdbSession:
+    """Manages dbghelp.dll symbol session lifecycle.
+
+    Context manager for dbghelp symbol operations. Handles initialization,
+    module loading, symbol enumeration, and cleanup.
+
+    Attributes:
+        hproc: Process handle (from GetCurrentProcess)
+        type_dumper_cache: One CTypeInfoDump per module base address
+
+    Usage:
+        ```python
+        # Basic session
+        with PdbSession(symbol_path=[".", "C:\\Symbols"]) as pdb:
+            base = pdb.load_module("app.exe")
+            for sym in pdb.enum_symbols(base):
+                print(f"{sym.name}: {sym.Address:#x}")
+
+        # Manual lifetime management
+        pdb = PdbSession()
+        try:
+            base = pdb.load_module("app.exe")
+            print(f"Module loaded at {base:#x}")
+        finally:
+            pdb.close()
+        ```
+
+    Symbol Options:
+        The session automatically enables:
+        - SYMOPT_DEFERRED_LOADS: Load symbols on demand
+        - SYMOPT_UNDNAME: Undecorate C++ symbols
+        - SYMOPT_LOAD_LINES: Load source line information
+
+    Note:
+        Always use context manager (with statement) or manually call
+        cleanup()/close() to ensure proper cleanup of dbghelp resources.
+    """
+
+    def __init__(self, symbol_path: list[str] | None = None):
+        """Initialize dbghelp symbol session.
+
+        Args:
+            symbol_path: Optional list of directories to search for symbols.
+                Supports local paths and symbol servers:
+                - Local: ["C:\\Symbols", "D:\\Debug"]
+                - Server: ["SRV*C:\\Cache*https://msdl.microsoft.com/download/symbols"]
+                Entries are joined with ";". If None or empty, no explicit
+                search path is passed to dbghelp.
+
+        Raises:
+            OSError: If not on Windows or symbol handler initialization fails
+        """
+        if not _WINDOWS:
+            raise OSError("PDB support requires Windows (dbghelp.dll)")
+
+        self.hproc = kernel32.GetCurrentProcess()
+        symbol_path_str = ";".join(symbol_path) if symbol_path else None
+
+        # The search path is a Python str, i.e. it is marshalled as a wide string.
+        # The un-suffixed SymInitialize takes an ANSI path, so it would read the
+        # UTF-16 buffer as a one-character path; use the wide entry point instead.
+        sym_initialize = dbghelp.SymInitializeW
+        sym_initialize.argtypes = [HANDLE, LPCWSTR, BOOL]
+        sym_initialize.restype = BOOL
+        if not sym_initialize(self.hproc, symbol_path_str, True):
+            raise OSError(f"SymInitialize failed, error={last_error()}")
+
+        opts = dbghelp.SymGetOptions()
+        opts |= SYMOPT_DEFERRED_LOADS | SYMOPT_UNDNAME | SYMOPT_LOAD_LINES
+        dbghelp.SymSetOptions(opts)
+        self.type_dumper_cache = {}
+
+    def type_info(self, base: int, type_index: int) -> symbols.TypeInfo:
+        """Resolve the type of ``type_index`` within the module loaded at ``base``.
+
+        Results are cached by the per-module CTypeInfoDump, so no additional
+        memoization (which would also pin the session object) is needed here.
+
+        Returns:
+            The resolved symbols.TypeInfo, or UnspecifiedType("unknown") for
+            a zero/empty type index.
+        """
+        if not type_index:
+            return symbols.UnspecifiedType("unknown")
+        type_dumper = self.type_dumper_cache.get(base)
+        if type_dumper is None:
+            type_dumper = self.type_dumper_cache[base] = CTypeInfoDump(self.hproc, base)
+        return type_dumper.get_full_type_name(type_index)
+
+    def cleanup(self) -> None:
+        """Cleans up the dbghelp session."""
+        if _WINDOWS and hasattr(self, "hproc"):
+            dbghelp.SymCleanup(self.hproc)
+
+    def close(self) -> None:
+        """Alias for cleanup(), matching the usage shown in the class documentation."""
+        self.cleanup()
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.cleanup()
+
+    def set_search_path(self, search_path: str) -> None:
+        """Sets the symbol search path for the current session.
+
+        Raises:
+            OSError: If SymSetSearchPath fails.
+        """
+        if not dbghelp.SymSetSearchPath(self.hproc, search_path.encode("ascii")):
+            raise OSError(f"SymSetSearchPath failed, error={last_error()}")
+
+    def get_search_path(self) -> str:
+        """Gets the symbol search path for the current session.
+
+        Raises:
+            OSError: If SymGetSearchPath fails.
+        """
+        buffer = ctypes.create_string_buffer(2048)
+        if not dbghelp.SymGetSearchPath(self.hproc, buffer, ctypes.sizeof(buffer)):
+            raise OSError(f"SymGetSearchPath failed, error={last_error()}")
+        return buffer.value.decode("ascii")
+
+    def load_module(self, file_path: str) -> int:
+        """Loads a module for the current session.
+
+        Returns:
+            The base address dbghelp assigned to the module.
+
+        Raises:
+            OSError: If SymLoadModuleExW returns 0.
+        """
+        file_path = str(file_path)
+        base = dbghelp.SymLoadModuleExW(self.hproc, None, file_path, None, 0, 0, None, 0)
+        if base == 0:
+            raise OSError(f"SymLoadModuleExW failed for {file_path}, error={last_error()}")
+        return base
+
+    def enum_symbols(self, base: int, pattern: bytes = b"*") -> list[SYMBOL_INFO]:
+        """Enumerates symbols in a loaded module.
+
+        Args:
+            base: Module base address as returned by load_module
+            pattern: Symbol name mask passed to SymEnumSymbols
+
+        Returns:
+            Copies of the SYMBOL_INFO records reported by dbghelp.
+
+        Raises:
+            OSError: If SymEnumSymbols fails.
+        """
+        results: list[SYMBOL_INFO] = []
+
+        def _cb(pSymInfo, size, ctx):
+            # The record behind the pointer is only valid during the callback, so copy it.
+            results.append(copy(pSymInfo.contents))
+            return True
+
+        cb = PSYM_ENUMERATESYMBOLS_CALLBACK(_cb)
+        if not dbghelp.SymEnumSymbols(self.hproc, base, pattern, cb, None):
+            raise OSError(f"SymEnumSymbols failed, error={last_error()}")
+        return results
+
+    def sym_from_addr(self, addr: int) -> tuple[str, int, int]:
+        """Retrieves symbol information for the specified address.
+
+        Returns:
+            Tuple of (symbol name, symbol address, displacement of ``addr``
+            from the symbol address).
+
+        Raises:
+            OSError: If SymFromAddr fails.
+        """
+        displacement = ULONG64(0)
+        info = SYMBOL_INFO()
+        info.SizeOfStruct = ctypes.sizeof(SYMBOL_INFO)
+        info.MaxNameLen = MAX_SYM_NAME
+        if not dbghelp.SymFromAddr(self.hproc, ULONG64(addr), ctypes.byref(displacement), ctypes.byref(info)):
+            raise OSError(f"SymFromAddr failed, error={last_error()}")
+        return info.Name.decode(errors="ignore"), int(info.Address), int(displacement.value)
+
+    def get_module_information(self, hmod: HANDLE) -> ModuleInfo:
+        """Gets module information for the given module handle.
+
+        Raises:
+            OSError: If GetModuleInformation fails.
+        """
+        modinfo = MODULEINFO()
+        if not psapi.GetModuleInformation(self.hproc, hmod, ctypes.byref(modinfo), ctypes.sizeof(modinfo)):
+            raise OSError(f"GetModuleInformation failed, error={last_error()}")
+        return ModuleInfo(modinfo.lpBaseOfDll, modinfo.SizeOfImage, modinfo.EntryPoint)
+
+
+def pdb_symbols_for_pe(pe_path: str, symbol_path: str | None = None) -> list[symbols.VariableType]:
+    """Load the data symbols of a PE file from its PDB (high-level API).
+
+    Convenience function that creates a PdbSession, loads the PE module,
+    enumerates all symbols and converts every data symbol (tag
+    ``SymTagData``) into a ``symbols.VariableType``.
+
+    Args:
+        pe_path: Path to PE file (.exe, .dll, etc.)
+        symbol_path: Optional symbol search path string.
+            Supports semicolon-separated paths and symbol servers:
+            - "C:\\Symbols;D:\\Debug"
+            - "SRV*C:\\Cache*https://msdl.microsoft.com/download/symbols"
+            If None or empty, no explicit search path is configured.
+
+    Returns:
+        List of ``symbols.VariableType`` objects, each built from the raw
+        symbol name, the resolved type, the address relative to the module
+        base, and the symbol size. Empty on non-Windows platforms or when
+        an OSError, RuntimeError or ValueError occurs.
+
+    Example:
+        ```python
+        variables = pdb_symbols_for_pe("app.exe")
+
+        # With symbol server
+        variables = pdb_symbols_for_pe(
+            "app.exe",
+            "SRV*C:\\SymCache*https://msdl.microsoft.com/download/symbols"
+        )
+        ```
+
+    Note:
+        - Symbol addresses are relative to the module base (not absolute)
+        - Errors are caught and printed, returning an empty list
+        - For more control, use PdbSession directly
+    """
+    if not _WINDOWS:
+        return []
+
+    try:
+        with PdbSession([symbol_path] if symbol_path else None) as session:
+            mod_base = session.load_module(pe_path)
+            result = []
+            for sym in session.enum_symbols(mod_base, b"*"):
+                if sym.tag != "SymTagData":
+                    continue
+                ti = session.type_info(mod_base, sym.TypeIndex)
+                # NOTE(review): sym.Name is the raw bytes value; sym.name would give the
+                # decoded str - confirm which one symbols.VariableType expects.
+                result.append(symbols.VariableType(sym.Name, ti, sym.Address - mod_base, sym.Size))
+            return result
+    except (OSError, RuntimeError, ValueError) as e:
+        print(f"Error: {str(e)}")
+        return []  # Return an empty list in case of errors.
+
+
+def main(pe_path: str): # pragma: no cover - debug helper
+ """Print name, type and value of the first 50 symbols found for ``pe_path``."""
+ items = pdb_symbols_for_pe(pe_path)
+ # NOTE(review): the loop subscripts each item like a dict ('name', 'type', 'value'),
+ # while pdb_symbols_for_pe appends symbols.VariableType objects - confirm that
+ # VariableType supports this access, otherwise this helper fails on the first item.
+ for it in items[:50]:
+ print(f"{it['name']} : {it.get('type', 'unknown')} @ 0x{it['value']:016X}")
diff --git a/objutils/scripts/oj_coff_syms.py b/objutils/scripts/oj_coff_syms.py
index d79bb2e..c289ff7 100644
--- a/objutils/scripts/oj_coff_syms.py
+++ b/objutils/scripts/oj_coff_syms.py
@@ -5,7 +5,7 @@
import argparse
-from objutils.pecoff import PeParser, SymbolAPI
+from objutils.pecoff import PeParser
def main(argv: list[str] | None = None) -> int:
@@ -39,7 +39,7 @@ def main(argv: list[str] | None = None) -> int:
syms = pp.symbols
# Fallback: if SymbolAPI attr is not present (static type), use direct list
- #if not syms and pp.symbols:
+ # if not syms and pp.symbols:
# syms = [type("_S", (), s) for s in pp.symbols] # quick adapter for printing
# syms is a list of model.Pe_Symbol; order by value already in fetch()
diff --git a/objutils/symbols.py b/objutils/symbols.py
index 13b5f0d..5e498bb 100644
--- a/objutils/symbols.py
+++ b/objutils/symbols.py
@@ -1,442 +1,440 @@
-"""General symbol abstraction that works on top of DWARF, PDB, or whatsoever.
-
-This module provides a format-neutral type system that can represent type information
-from DWARF (ELF debug info), PDB (Windows debug info), or any other debug format.
-
-The central abstraction for primitive types is :class:`TypeEncoding`, which replaces
-the previous format-specific ``encoding: Any`` fields. Two helper functions translate
-format-specific values to :class:`TypeEncoding`:
-
-- :func:`type_encoding_from_dwarf_ate` – converts a DWARF ``DW_ATE_*`` integer value
-- :func:`type_encoding_from_pdb_bt` – converts a PDB ``BasicType`` (``btXxx``) integer value
-"""
-
-from __future__ import annotations
-
-import enum
-from dataclasses import dataclass, field
-from typing import Any, TypeAlias
-
-
-# ---------------------------------------------------------------------------
-# Format-neutral type encoding
-# ---------------------------------------------------------------------------
-
-
-class TypeKind(enum.Enum):
- """Fundamental type category, independent of any debug-format specifics.
-
- Groups:
- Basic: VOID, BOOLEAN, ADDRESS, INTEGER, FLOAT
- Float ext.: COMPLEX_FLOAT, IMAGINARY_FLOAT, DECIMAL_FLOAT
- Characters: CHAR (repertoire stored in :class:`CharEncoding`)
- Scaled: FIXED, PACKED_DECIMAL, NUMERIC_STRING, EDITED
- Windows/COM: BCD, BIT, CURRENCY, DATE, VARIANT, HRESULT, BSTR
- Fallback: UNKNOWN
- """
-
- VOID = "void"
- BOOLEAN = "boolean"
- ADDRESS = "address"
- INTEGER = "integer"
- FLOAT = "float"
- COMPLEX_FLOAT = "complex_float"
- IMAGINARY_FLOAT = "imaginary_float"
- DECIMAL_FLOAT = "decimal_float"
- CHAR = "char"
- FIXED = "fixed"
- PACKED_DECIMAL = "packed_decimal"
- NUMERIC_STRING = "numeric_string"
- EDITED = "edited"
- BCD = "bcd"
- BIT = "bit"
- CURRENCY = "currency"
- DATE = "date"
- VARIANT = "variant"
- HRESULT = "hresult"
- BSTR = "bstr"
- UNKNOWN = "unknown"
-
-
-class Signedness(enum.Enum):
- """Sign property of a type.
-
- Attributes:
- SIGNED: Explicitly signed (e.g. ``int``, ``signed char``).
- UNSIGNED: Explicitly unsigned (e.g. ``unsigned int``, ``uint8_t``).
- NOT_APPLICABLE: Signedness is semantically meaningless for this kind
- (float, void, bool, unicode char, address, …).
- UNSPECIFIED: Signedness is theoretically applicable but not yet
- determined – e.g. plain ``char`` whose signedness is
- implementation-defined in C.
- """
-
- SIGNED = "signed"
- UNSIGNED = "unsigned"
- NOT_APPLICABLE = "n/a"
- UNSPECIFIED = "unspecified"
-
-
-class CharEncoding(enum.Enum):
- """Character repertoire / encoding, only meaningful when :attr:`TypeKind` is ``CHAR``.
-
- Attributes:
- UNSPECIFIED: Byte-sized ``char`` without a specified encoding (C ``char``).
- ASCII: ISO/IEC 646:1991 – Fortran ``ASCII`` kind; DWARF ``DW_ATE_ASCII``.
- UCS: ISO/IEC 10646 UCS-4 – Fortran ``ISO_10646`` kind; DWARF ``DW_ATE_UCS``.
- UTF: ISO/IEC 10646-1:1993 (general Unicode); DWARF ``DW_ATE_UTF``.
- UTF8: UTF-8 – C23/C++20 ``char8_t`` (unsigned); PDB ``btChar8``.
- UTF16: UTF-16 – C++11 ``char16_t``; PDB ``btChar16``.
- UTF32: UTF-32 – C++11 ``char32_t``; PDB ``btChar32``.
- WIDE: Platform-defined wide character ``wchar_t``; PDB ``btWChar``.
- """
-
- UNSPECIFIED = "unspecified"
- ASCII = "ascii"
- UCS = "ucs"
- UTF = "utf"
- UTF8 = "utf8"
- UTF16 = "utf16"
- UTF32 = "utf32"
- WIDE = "wide"
-
-
-@dataclass(frozen=True)
-class TypeEncoding:
- """Format-neutral encoding descriptor for a primitive type.
-
- Combines a :class:`TypeKind` with optional :class:`Signedness` and
- :class:`CharEncoding` qualifiers. The dataclass is *frozen* so instances
- are hashable and can be used as dict or set keys.
-
- Args:
- kind: Fundamental category of the type.
- signedness: Sign property (defaults to :attr:`Signedness.NOT_APPLICABLE`).
- char_encoding: Character repertoire for ``CHAR`` types
- (defaults to :attr:`CharEncoding.UNSPECIFIED`).
-
- Examples::
-
- >>> TypeEncoding(TypeKind.INTEGER, Signedness.SIGNED)
- TypeEncoding(kind=INTEGER, signedness=SIGNED, char_encoding=UNSPECIFIED)
- >>> TypeEncoding(TypeKind.CHAR, Signedness.NOT_APPLICABLE, CharEncoding.UTF16)
- TypeEncoding(kind=CHAR, signedness=NOT_APPLICABLE, char_encoding=UTF16)
-
- Use :func:`type_encoding_from_dwarf_ate` and :func:`type_encoding_from_pdb_bt`
- to construct instances from format-specific values.
- """
-
- kind: TypeKind
- signedness: Signedness = Signedness.NOT_APPLICABLE
- char_encoding: CharEncoding = CharEncoding.UNSPECIFIED
-
- # ------------------------------------------------------------------
- # Convenience predicates
- # ------------------------------------------------------------------
-
- def is_signed(self) -> bool:
- """Return ``True`` if the type is explicitly signed."""
- return self.signedness == Signedness.SIGNED
-
- def is_unsigned(self) -> bool:
- """Return ``True`` if the type is explicitly unsigned."""
- return self.signedness == Signedness.UNSIGNED
-
- def is_integer(self) -> bool:
- """Return ``True`` for integer kinds (signed or unsigned)."""
- return self.kind == TypeKind.INTEGER
-
- def is_float(self) -> bool:
- """Return ``True`` for any floating-point kind."""
- return self.kind in (TypeKind.FLOAT, TypeKind.COMPLEX_FLOAT, TypeKind.IMAGINARY_FLOAT, TypeKind.DECIMAL_FLOAT)
-
- def is_char(self) -> bool:
- """Return ``True`` for character types."""
- return self.kind == TypeKind.CHAR
-
- def is_void(self) -> bool:
- """Return ``True`` for the void type."""
- return self.kind == TypeKind.VOID
-
- def is_boolean(self) -> bool:
- """Return ``True`` for boolean types."""
- return self.kind == TypeKind.BOOLEAN
-
- def __str__(self) -> str:
- parts: list[str] = []
- if self.signedness not in (Signedness.NOT_APPLICABLE, Signedness.UNSPECIFIED):
- parts.append(self.signedness.value)
- parts.append(self.kind.value)
- if self.char_encoding != CharEncoding.UNSPECIFIED:
- parts.append(f"({self.char_encoding.value})")
- return " ".join(parts)
-
- def __repr__(self) -> str:
- return (
- f"TypeEncoding(kind={self.kind.name}, signedness={self.signedness.name}, "
- f"char_encoding={self.char_encoding.name})"
- )
-
-
-# ---------------------------------------------------------------------------
-# Conversion: DWARF DW_ATE_* → TypeEncoding
-# ---------------------------------------------------------------------------
-
-# Raw integer keys are the DW_ATE_* values from DWARF4 Table 5.1.
-# HP vendor extensions (0x80–0x8B) are not listed here; they map to UNKNOWN.
-_DWARF_ATE_MAP: dict[int, TypeEncoding] = {
- 0x0: TypeEncoding(TypeKind.VOID), # void (compiler extension, not in spec)
- 0x1: TypeEncoding(TypeKind.ADDRESS, Signedness.NOT_APPLICABLE), # DW_ATE_address
- 0x2: TypeEncoding(TypeKind.BOOLEAN, Signedness.NOT_APPLICABLE), # DW_ATE_boolean
- 0x3: TypeEncoding(TypeKind.COMPLEX_FLOAT, Signedness.NOT_APPLICABLE), # DW_ATE_complex_float
- 0x4: TypeEncoding(TypeKind.FLOAT, Signedness.NOT_APPLICABLE), # DW_ATE_float
- 0x5: TypeEncoding(TypeKind.INTEGER, Signedness.SIGNED), # DW_ATE_signed
- 0x6: TypeEncoding(TypeKind.CHAR, Signedness.SIGNED, CharEncoding.UNSPECIFIED), # DW_ATE_signed_char
- 0x7: TypeEncoding(TypeKind.INTEGER, Signedness.UNSIGNED), # DW_ATE_unsigned
- 0x8: TypeEncoding(TypeKind.CHAR, Signedness.UNSIGNED, CharEncoding.UNSPECIFIED), # DW_ATE_unsigned_char
- 0x9: TypeEncoding(TypeKind.IMAGINARY_FLOAT, Signedness.NOT_APPLICABLE), # DW_ATE_imaginary_float
- 0xA: TypeEncoding(TypeKind.PACKED_DECIMAL), # DW_ATE_packed_decimal
- 0xB: TypeEncoding(TypeKind.NUMERIC_STRING), # DW_ATE_numeric_string
- 0xC: TypeEncoding(TypeKind.EDITED), # DW_ATE_edited
- 0xD: TypeEncoding(TypeKind.FIXED, Signedness.SIGNED), # DW_ATE_signed_fixed
- 0xE: TypeEncoding(TypeKind.FIXED, Signedness.UNSIGNED), # DW_ATE_unsigned_fixed
- 0xF: TypeEncoding(TypeKind.DECIMAL_FLOAT, Signedness.NOT_APPLICABLE), # DW_ATE_decimal_float
- 0x10: TypeEncoding(TypeKind.CHAR, Signedness.NOT_APPLICABLE, CharEncoding.UTF), # DW_ATE_UTF (char16_t / char32_t / u8 in C++)
- 0x11: TypeEncoding(TypeKind.CHAR, Signedness.NOT_APPLICABLE, CharEncoding.UCS), # DW_ATE_UCS (Fortran ISO_10646)
- 0x12: TypeEncoding(TypeKind.CHAR, Signedness.NOT_APPLICABLE, CharEncoding.ASCII), # DW_ATE_ASCII (Fortran ASCII kind)
-}
-
-
-def type_encoding_from_dwarf_ate(ate_value: int) -> TypeEncoding:
- """Convert a DWARF ``DW_ATE_*`` integer value to a :class:`TypeEncoding`.
-
- Args:
- ate_value: Raw ``DW_AT_encoding`` value (e.g. ``BaseTypeEncoding.signed`` = 5).
-
- Returns:
- Matching :class:`TypeEncoding`, or ``TypeEncoding(TypeKind.UNKNOWN)`` for
- unrecognised or vendor-extension values.
-
- Example::
-
- >>> from objutils.dwarf.constants import BaseTypeEncoding
- >>> type_encoding_from_dwarf_ate(int(BaseTypeEncoding.float))
- TypeEncoding(kind=FLOAT, signedness=NOT_APPLICABLE, char_encoding=UNSPECIFIED)
- """
- return _DWARF_ATE_MAP.get(int(ate_value), TypeEncoding(TypeKind.UNKNOWN))
-
-
-# ---------------------------------------------------------------------------
-# Conversion: PDB BasicType → TypeEncoding
-# ---------------------------------------------------------------------------
-
-# Raw integer keys are the btXxx values from Microsoft cvconst.h.
-_PDB_BT_MAP: dict[int, TypeEncoding] = {
- 0: TypeEncoding(TypeKind.UNKNOWN), # btNoType
- 1: TypeEncoding(TypeKind.VOID), # btVoid
- 2: TypeEncoding(TypeKind.CHAR, Signedness.UNSPECIFIED, CharEncoding.ASCII), # btChar – plain C char (impl-defined signedness)
- 3: TypeEncoding(TypeKind.CHAR, Signedness.NOT_APPLICABLE, CharEncoding.WIDE), # btWChar – wchar_t
- 6: TypeEncoding(TypeKind.INTEGER, Signedness.SIGNED), # btInt
- 7: TypeEncoding(TypeKind.INTEGER, Signedness.UNSIGNED), # btUInt
- 8: TypeEncoding(TypeKind.FLOAT, Signedness.NOT_APPLICABLE), # btFloat
- 9: TypeEncoding(TypeKind.BCD), # btBCD
- 10: TypeEncoding(TypeKind.BOOLEAN, Signedness.NOT_APPLICABLE), # btBool
- 13: TypeEncoding(TypeKind.INTEGER, Signedness.SIGNED), # btLong (size captured in byte_size)
- 14: TypeEncoding(TypeKind.INTEGER, Signedness.UNSIGNED), # btULong
- 25: TypeEncoding(TypeKind.CURRENCY), # btCurrency
- 26: TypeEncoding(TypeKind.DATE), # btDate
- 27: TypeEncoding(TypeKind.VARIANT), # btVariant
- 28: TypeEncoding(TypeKind.COMPLEX_FLOAT, Signedness.NOT_APPLICABLE), # btComplex
- 29: TypeEncoding(TypeKind.BIT), # btBit
- 30: TypeEncoding(TypeKind.BSTR), # btBSTR
- 31: TypeEncoding(TypeKind.HRESULT), # btHresult
- 32: TypeEncoding(TypeKind.CHAR, Signedness.NOT_APPLICABLE, CharEncoding.UTF16), # btChar16 – char16_t (C++11)
- 33: TypeEncoding(TypeKind.CHAR, Signedness.NOT_APPLICABLE, CharEncoding.UTF32), # btChar32 – char32_t (C++11)
- 34: TypeEncoding(TypeKind.CHAR, Signedness.UNSIGNED, CharEncoding.UTF8), # btChar8 – char8_t (C++20, always unsigned)
-}
-
-
-def type_encoding_from_pdb_bt(bt_value: int) -> TypeEncoding:
- """Convert a PDB ``BasicType`` integer value to a :class:`TypeEncoding`.
-
- Args:
- bt_value: Raw ``BasicType`` value from dbghelp/cvconst.h
- (e.g. ``BasicType.btFloat`` = 8).
-
- Returns:
- Matching :class:`TypeEncoding`, or ``TypeEncoding(TypeKind.UNKNOWN)`` for
- unrecognised values.
-
- Example::
-
- >>> from objutils.pecoff.pdb import BasicType
- >>> type_encoding_from_pdb_bt(int(BasicType.btFloat))
- TypeEncoding(kind=FLOAT, signedness=NOT_APPLICABLE, char_encoding=UNSPECIFIED)
- """
- return _PDB_BT_MAP.get(int(bt_value), TypeEncoding(TypeKind.UNKNOWN))
-
-
-# ---------------------------------------------------------------------------
-# Symbol / type dataclasses
-# ---------------------------------------------------------------------------
-
-
-@dataclass
-class PrimitiveType:
- """A primitive / base type.
-
- Attributes:
- name: Type name as it appears in the source (e.g. ``"int``, ``"float"``).
- encoding: Format-neutral :class:`TypeEncoding` describing how the value
- is encoded and interpreted.
- byte_size: Storage size in bytes.
- """
-
- name: str
- encoding: TypeEncoding
- byte_size: int
-
-
-@dataclass
-class ArrayType:
- type: TypeInfo
- array_spec: list[tuple[int, int]] = field(default_factory=list)
-
-
-@dataclass
-class TypeDefinition:
- name: str
- type: TypeInfo
-
-
-@dataclass
-class VolatileType:
- type: TypeInfo
-
-
-@dataclass
-class ConstantType:
- type: TypeInfo
-
-
-@dataclass
-class PointerType:
- type: TypeInfo
-
-
-@dataclass
-class ReferenceType:
- type: TypeInfo
-
-
-@dataclass
-class Enumerator:
- name: str
- value: int
-
-
-@dataclass
-class EnumerationType:
- """An enumeration type.
-
- Attributes:
- name: Enumeration name.
- byte_size: Storage size in bytes.
- encoding: :class:`TypeEncoding` of the underlying integer type,
- or ``None`` when not determinable.
- base_type: Resolved underlying type (usually a :class:`PrimitiveType`).
- enumerators: List of named enumeration constants.
- """
-
- name: str
- byte_size: int
- encoding: TypeEncoding | None
- base_type: TypeInfo
- enumerators: list[Enumerator] = field(default_factory=list)
-
-
-@dataclass
-class UnspecifiedType:
- name: str
-
-
-@dataclass
-class StructMember:
- name: str
- type: TypeInfo
- offset: int
-
-
-@dataclass
-class StructureType:
- name: str
- byte_size: int
- member: list[StructMember] = field(default_factory=list)
-
-
-@dataclass
-class ClassMember:
- name: str
- linkage_name: str
- type: TypeInfo
- offset: int
- accessibility: Any # Accessibility
- external: bool
-
-
-@dataclass
-class ClassType:
- name: str
- byte_size: int
- member: list[ClassMember] = field(default_factory=list)
-
-
-@dataclass
-class UnionType:
- name: str
- byte_size: int
- alternatives: list[StructMember] = field(default_factory=list)
-
-
-@dataclass
-class SubroutineType:
- name: str
- prototyped: int
- return_type: TypeInfo
- parameters: list[TypeInfo] = field(default_factory=list)
-
-
-@dataclass
-class VariableType:
- name: str
- type: TypeInfo
- location: int
- size: int
-
-
-@dataclass
-class DataType:
- name: str
- value: Any
- type: TypeInfo
- datakind: Any
-
-
-TypeInfo: TypeAlias = (
- PrimitiveType
- | ArrayType
- | TypeDefinition
- | VolatileType
- | ConstantType
- | PointerType
- | ReferenceType
- | EnumerationType
- | UnspecifiedType
- | StructureType
- | ClassType
- | UnionType
- | SubroutineType
-)
-
-# Backward-compatible alias (legacy typo retained intentionally).
-TypeDefiniton = TypeDefinition
+"""General symbol abstraction that works on top of DWARF, PDB, or whatsoever.
+
+This module provides a format-neutral type system that can represent type information
+from DWARF (ELF debug info), PDB (Windows debug info), or any other debug format.
+
+The central abstraction for primitive types is :class:`TypeEncoding`, which replaces
+the previous format-specific ``encoding: Any`` fields. Two helper functions translate
+format-specific values to :class:`TypeEncoding`:
+
+- :func:`type_encoding_from_dwarf_ate` – converts a DWARF ``DW_ATE_*`` integer value
+- :func:`type_encoding_from_pdb_bt` – converts a PDB ``BasicType`` (``btXxx``) integer value
+"""
+
+from __future__ import annotations
+
+import enum
+from dataclasses import dataclass, field
+from typing import Any, TypeAlias
+
+# ---------------------------------------------------------------------------
+# Format-neutral type encoding
+# ---------------------------------------------------------------------------
+
+
+class TypeKind(enum.Enum):
+    """Fundamental type category, independent of any debug-format specifics.
+
+    Each member's value is a lowercase string; ``TypeEncoding.__str__`` emits
+    that string as the type word.
+
+    Groups:
+        Basic:        VOID, BOOLEAN, ADDRESS, INTEGER, FLOAT
+        Float ext.:   COMPLEX_FLOAT, IMAGINARY_FLOAT, DECIMAL_FLOAT
+        Characters:   CHAR (repertoire stored in :class:`CharEncoding`)
+        Scaled:       FIXED, PACKED_DECIMAL, NUMERIC_STRING, EDITED
+        Windows/COM:  BCD, BIT, CURRENCY, DATE, VARIANT, HRESULT, BSTR
+        Fallback:     UNKNOWN
+    """
+
+    # Basic
+    VOID = "void"
+    BOOLEAN = "boolean"
+    ADDRESS = "address"
+    INTEGER = "integer"
+    FLOAT = "float"
+    # Float extensions
+    COMPLEX_FLOAT = "complex_float"
+    IMAGINARY_FLOAT = "imaginary_float"
+    DECIMAL_FLOAT = "decimal_float"
+    # Characters
+    CHAR = "char"
+    # Scaled
+    FIXED = "fixed"
+    PACKED_DECIMAL = "packed_decimal"
+    NUMERIC_STRING = "numeric_string"
+    EDITED = "edited"
+    # Windows/COM
+    BCD = "bcd"
+    BIT = "bit"
+    CURRENCY = "currency"
+    DATE = "date"
+    VARIANT = "variant"
+    HRESULT = "hresult"
+    BSTR = "bstr"
+    # Fallback used by both conversion helpers for unmapped codes
+    UNKNOWN = "unknown"
+
+
+class Signedness(enum.Enum):
+    """Sign property of a type.
+
+    ``TypeEncoding.__str__`` prints the value only for ``SIGNED`` and
+    ``UNSIGNED``; the other two members are omitted from the text form.
+
+    Attributes:
+        SIGNED: Explicitly signed (e.g. ``int``, ``signed char``).
+        UNSIGNED: Explicitly unsigned (e.g. ``unsigned int``, ``uint8_t``).
+        NOT_APPLICABLE: Signedness is semantically meaningless for this kind
+            (float, void, bool, unicode char, address, …).
+        UNSPECIFIED: Signedness is theoretically applicable but not yet
+            determined – e.g. plain ``char`` whose signedness is
+            implementation-defined in C.
+    """
+
+    SIGNED = "signed"
+    UNSIGNED = "unsigned"
+    NOT_APPLICABLE = "n/a"  # default of TypeEncoding.signedness
+    UNSPECIFIED = "unspecified"
+
+
+class CharEncoding(enum.Enum):
+    """Character repertoire / encoding, only meaningful when :attr:`TypeKind` is ``CHAR``.
+
+    ``TypeEncoding.__str__`` appends the value in parentheses unless it is
+    ``UNSPECIFIED``.
+
+    Attributes:
+        UNSPECIFIED: Byte-sized ``char`` without a specified encoding (C ``char``).
+        ASCII: ISO/IEC 646:1991 – Fortran ``ASCII`` kind; DWARF ``DW_ATE_ASCII``.
+        UCS: ISO/IEC 10646 UCS-4 – Fortran ``ISO_10646`` kind; DWARF ``DW_ATE_UCS``.
+        UTF: ISO/IEC 10646-1:1993 (general Unicode); DWARF ``DW_ATE_UTF``.
+        UTF8: UTF-8 – C23/C++20 ``char8_t`` (unsigned); PDB ``btChar8``.
+        UTF16: UTF-16 – C++11 ``char16_t``; PDB ``btChar16``.
+        UTF32: UTF-32 – C++11 ``char32_t``; PDB ``btChar32``.
+        WIDE: Platform-defined wide character ``wchar_t``; PDB ``btWChar``.
+    """
+
+    UNSPECIFIED = "unspecified"  # default of TypeEncoding.char_encoding
+    ASCII = "ascii"
+    UCS = "ucs"
+    UTF = "utf"
+    UTF8 = "utf8"
+    UTF16 = "utf16"
+    UTF32 = "utf32"
+    WIDE = "wide"
+
+
+@dataclass(frozen=True)
+class TypeEncoding:
+    """Describe how a primitive type is encoded, without tying it to one debug format.
+
+    A :class:`TypeKind` is paired with a :class:`Signedness` and a
+    :class:`CharEncoding` qualifier. Because the dataclass is *frozen*,
+    instances are hashable and may serve as dict or set keys.
+
+    Args:
+        kind: Fundamental category of the type.
+        signedness: Sign property (defaults to :attr:`Signedness.NOT_APPLICABLE`).
+        char_encoding: Character repertoire for ``CHAR`` types
+            (defaults to :attr:`CharEncoding.UNSPECIFIED`).
+
+    Examples::
+
+        >>> TypeEncoding(TypeKind.INTEGER, Signedness.SIGNED)
+        TypeEncoding(kind=INTEGER, signedness=SIGNED, char_encoding=UNSPECIFIED)
+        >>> TypeEncoding(TypeKind.CHAR, Signedness.NOT_APPLICABLE, CharEncoding.UTF16)
+        TypeEncoding(kind=CHAR, signedness=NOT_APPLICABLE, char_encoding=UTF16)
+
+    Instances are normally obtained from format-specific values through
+    :func:`type_encoding_from_dwarf_ate` and :func:`type_encoding_from_pdb_bt`.
+    """
+
+    kind: TypeKind
+    signedness: Signedness = Signedness.NOT_APPLICABLE
+    char_encoding: CharEncoding = CharEncoding.UNSPECIFIED
+
+    # ------------------------------------------------------------------
+    # Convenience predicates
+    # ------------------------------------------------------------------
+
+    def is_signed(self) -> bool:
+        """Tell whether the signedness is exactly ``SIGNED``."""
+        explicitly_signed = self.signedness == Signedness.SIGNED
+        return explicitly_signed
+
+    def is_unsigned(self) -> bool:
+        """Tell whether the signedness is exactly ``UNSIGNED``."""
+        explicitly_unsigned = self.signedness == Signedness.UNSIGNED
+        return explicitly_unsigned
+
+    def is_integer(self) -> bool:
+        """Tell whether the kind is ``INTEGER``, regardless of signedness."""
+        return self.kind == TypeKind.INTEGER
+
+    def is_float(self) -> bool:
+        """Tell whether the kind is one of the four floating-point kinds."""
+        float_kinds = (
+            TypeKind.FLOAT,
+            TypeKind.COMPLEX_FLOAT,
+            TypeKind.IMAGINARY_FLOAT,
+            TypeKind.DECIMAL_FLOAT,
+        )
+        return self.kind in float_kinds
+
+    def is_char(self) -> bool:
+        """Tell whether the kind is ``CHAR``."""
+        return self.kind == TypeKind.CHAR
+
+    def is_void(self) -> bool:
+        """Tell whether the kind is ``VOID``."""
+        return self.kind == TypeKind.VOID
+
+    def is_boolean(self) -> bool:
+        """Tell whether the kind is ``BOOLEAN``."""
+        return self.kind == TypeKind.BOOLEAN
+
+    def __str__(self) -> str:
+        """Render as ``[signedness] kind [(char_encoding)]``, space separated.
+
+        The signedness word is left out for ``NOT_APPLICABLE`` and
+        ``UNSPECIFIED``; the parenthesised encoding is left out for
+        ``CharEncoding.UNSPECIFIED``.
+        """
+        hidden_signs = (Signedness.NOT_APPLICABLE, Signedness.UNSPECIFIED)
+        words: list[str] = [] if self.signedness in hidden_signs else [self.signedness.value]
+        words.append(self.kind.value)
+        if self.char_encoding != CharEncoding.UNSPECIFIED:
+            words.append(f"({self.char_encoding.value})")
+        return " ".join(words)
+
+    def __repr__(self) -> str:
+        """Render the three fields by enum member *name* (not value)."""
+        kind_name = self.kind.name
+        sign_name = self.signedness.name
+        charset_name = self.char_encoding.name
+        return f"TypeEncoding(kind={kind_name}, signedness={sign_name}, char_encoding={charset_name})"
+
+
+# ---------------------------------------------------------------------------
+# Conversion: DWARF DW_ATE_* → TypeEncoding
+# ---------------------------------------------------------------------------
+
+# Raw integer keys are the DW_ATE_* base-type encoding codes of the DWARF
+# standard: 0x1–0x10 are the DWARF 4 set, 0x11 (DW_ATE_UCS) and 0x12
+# (DW_ATE_ASCII) were added in DWARF 5, and 0x0 is a non-standard entry.
+# HP vendor extensions (0x80–0x8B) are not listed here; they map to UNKNOWN
+# through the ``.get`` default in type_encoding_from_dwarf_ate().
+_DWARF_ATE_MAP: dict[int, TypeEncoding] = {
+    0x0: TypeEncoding(TypeKind.VOID),  # void (compiler extension, not in spec)
+    0x1: TypeEncoding(TypeKind.ADDRESS, Signedness.NOT_APPLICABLE),  # DW_ATE_address
+    0x2: TypeEncoding(TypeKind.BOOLEAN, Signedness.NOT_APPLICABLE),  # DW_ATE_boolean
+    0x3: TypeEncoding(TypeKind.COMPLEX_FLOAT, Signedness.NOT_APPLICABLE),  # DW_ATE_complex_float
+    0x4: TypeEncoding(TypeKind.FLOAT, Signedness.NOT_APPLICABLE),  # DW_ATE_float
+    0x5: TypeEncoding(TypeKind.INTEGER, Signedness.SIGNED),  # DW_ATE_signed
+    0x6: TypeEncoding(TypeKind.CHAR, Signedness.SIGNED, CharEncoding.UNSPECIFIED),  # DW_ATE_signed_char
+    0x7: TypeEncoding(TypeKind.INTEGER, Signedness.UNSIGNED),  # DW_ATE_unsigned
+    0x8: TypeEncoding(TypeKind.CHAR, Signedness.UNSIGNED, CharEncoding.UNSPECIFIED),  # DW_ATE_unsigned_char
+    0x9: TypeEncoding(TypeKind.IMAGINARY_FLOAT, Signedness.NOT_APPLICABLE),  # DW_ATE_imaginary_float
+    0xA: TypeEncoding(TypeKind.PACKED_DECIMAL),  # DW_ATE_packed_decimal
+    0xB: TypeEncoding(TypeKind.NUMERIC_STRING),  # DW_ATE_numeric_string
+    0xC: TypeEncoding(TypeKind.EDITED),  # DW_ATE_edited
+    0xD: TypeEncoding(TypeKind.FIXED, Signedness.SIGNED),  # DW_ATE_signed_fixed
+    0xE: TypeEncoding(TypeKind.FIXED, Signedness.UNSIGNED),  # DW_ATE_unsigned_fixed
+    0xF: TypeEncoding(TypeKind.DECIMAL_FLOAT, Signedness.NOT_APPLICABLE),  # DW_ATE_decimal_float
+    0x10: TypeEncoding(TypeKind.CHAR, Signedness.NOT_APPLICABLE, CharEncoding.UTF),  # DW_ATE_UTF (char16_t / char32_t / u8 in C++)
+    0x11: TypeEncoding(TypeKind.CHAR, Signedness.NOT_APPLICABLE, CharEncoding.UCS),  # DW_ATE_UCS (Fortran ISO_10646)
+    0x12: TypeEncoding(TypeKind.CHAR, Signedness.NOT_APPLICABLE, CharEncoding.ASCII),  # DW_ATE_ASCII (Fortran ASCII kind)
+}
+
+
+def type_encoding_from_dwarf_ate(ate_value: int) -> TypeEncoding:
+    """Look up the :class:`TypeEncoding` for a DWARF ``DW_ATE_*`` code.
+
+    Args:
+        ate_value: Raw ``DW_AT_encoding`` value (e.g. ``BaseTypeEncoding.signed`` = 5);
+            it is passed through ``int()`` before the lookup.
+
+    Returns:
+        The entry of ``_DWARF_ATE_MAP`` for that code, or
+        ``TypeEncoding(TypeKind.UNKNOWN)`` when the code has no entry
+        (unrecognised or vendor-extension values).
+
+    Example::
+
+        >>> from objutils.dwarf.constants import BaseTypeEncoding
+        >>> type_encoding_from_dwarf_ate(int(BaseTypeEncoding.float))
+        TypeEncoding(kind=FLOAT, signedness=NOT_APPLICABLE, char_encoding=UNSPECIFIED)
+    """
+    code = int(ate_value)
+    if code in _DWARF_ATE_MAP:
+        return _DWARF_ATE_MAP[code]
+    return TypeEncoding(TypeKind.UNKNOWN)
+
+
+# ---------------------------------------------------------------------------
+# Conversion: PDB BasicType → TypeEncoding
+# ---------------------------------------------------------------------------
+
+# Raw integer keys are the btXxx values from Microsoft cvconst.h.
+# Codes without an entry (the key sequence below has gaps) fall back to
+# UNKNOWN through the ``.get`` default in type_encoding_from_pdb_bt().
+# NOTE(review): btChar is mapped to CharEncoding.ASCII although the
+# CharEncoding docstring describes UNSPECIFIED as the plain C ``char`` case —
+# confirm which one is intended.
+_PDB_BT_MAP: dict[int, TypeEncoding] = {
+    0: TypeEncoding(TypeKind.UNKNOWN),  # btNoType
+    1: TypeEncoding(TypeKind.VOID),  # btVoid
+    2: TypeEncoding(TypeKind.CHAR, Signedness.UNSPECIFIED, CharEncoding.ASCII),  # btChar – plain C char (impl-defined signedness)
+    3: TypeEncoding(TypeKind.CHAR, Signedness.NOT_APPLICABLE, CharEncoding.WIDE),  # btWChar – wchar_t
+    6: TypeEncoding(TypeKind.INTEGER, Signedness.SIGNED),  # btInt
+    7: TypeEncoding(TypeKind.INTEGER, Signedness.UNSIGNED),  # btUInt
+    8: TypeEncoding(TypeKind.FLOAT, Signedness.NOT_APPLICABLE),  # btFloat
+    9: TypeEncoding(TypeKind.BCD),  # btBCD
+    10: TypeEncoding(TypeKind.BOOLEAN, Signedness.NOT_APPLICABLE),  # btBool
+    13: TypeEncoding(TypeKind.INTEGER, Signedness.SIGNED),  # btLong (size captured in byte_size)
+    14: TypeEncoding(TypeKind.INTEGER, Signedness.UNSIGNED),  # btULong
+    25: TypeEncoding(TypeKind.CURRENCY),  # btCurrency
+    26: TypeEncoding(TypeKind.DATE),  # btDate
+    27: TypeEncoding(TypeKind.VARIANT),  # btVariant
+    28: TypeEncoding(TypeKind.COMPLEX_FLOAT, Signedness.NOT_APPLICABLE),  # btComplex
+    29: TypeEncoding(TypeKind.BIT),  # btBit
+    30: TypeEncoding(TypeKind.BSTR),  # btBSTR
+    31: TypeEncoding(TypeKind.HRESULT),  # btHresult
+    32: TypeEncoding(TypeKind.CHAR, Signedness.NOT_APPLICABLE, CharEncoding.UTF16),  # btChar16 – char16_t (C++11)
+    33: TypeEncoding(TypeKind.CHAR, Signedness.NOT_APPLICABLE, CharEncoding.UTF32),  # btChar32 – char32_t (C++11)
+    34: TypeEncoding(TypeKind.CHAR, Signedness.UNSIGNED, CharEncoding.UTF8),  # btChar8 – char8_t (C++20, always unsigned)
+}
+
+
+def type_encoding_from_pdb_bt(bt_value: int) -> TypeEncoding:
+    """Look up the :class:`TypeEncoding` for a PDB ``BasicType`` code.
+
+    Args:
+        bt_value: Raw ``BasicType`` value from dbghelp/cvconst.h
+            (e.g. ``BasicType.btFloat`` = 8); it is passed through ``int()``
+            before the lookup.
+
+    Returns:
+        The entry of ``_PDB_BT_MAP`` for that code, or
+        ``TypeEncoding(TypeKind.UNKNOWN)`` when the code has no entry.
+
+    Example::
+
+        >>> from objutils.pecoff.pdb import BasicType
+        >>> type_encoding_from_pdb_bt(int(BasicType.btFloat))
+        TypeEncoding(kind=FLOAT, signedness=NOT_APPLICABLE, char_encoding=UNSPECIFIED)
+    """
+    code = int(bt_value)  # converted outside the try so only the lookup is guarded
+    try:
+        return _PDB_BT_MAP[code]
+    except KeyError:
+        return TypeEncoding(TypeKind.UNKNOWN)
+
+
+# ---------------------------------------------------------------------------
+# Symbol / type dataclasses
+# ---------------------------------------------------------------------------
+
+
+@dataclass
+class PrimitiveType:
+    """A primitive / base type.
+
+    Attributes:
+        name: Type name as it appears in the source (e.g. ``"int"``, ``"float"``).
+        encoding: Format-neutral :class:`TypeEncoding` describing how the value
+            is encoded and interpreted.
+        byte_size: Storage size in bytes.
+    """
+
+    name: str
+    encoding: TypeEncoding
+    byte_size: int
+
+
+@dataclass
+class ArrayType:
+    """Array type descriptor wrapping an inner ``type``.
+
+    NOTE(review): ``array_spec`` is a list of ``(int, int)`` tuples —
+    presumably one per dimension; the meaning of the two integers is not
+    visible here, confirm against the code that builds it.
+    """
+
+    type: TypeInfo
+    array_spec: list[tuple[int, int]] = field(default_factory=list)  # fresh list per instance
+
+
+@dataclass
+class TypeDefinition:
+    """A named type that refers to another ``type`` (presumably a typedef-style alias).
+
+    The misspelled module-level name ``TypeDefiniton`` is kept as an alias of this class.
+    """
+
+    name: str
+    type: TypeInfo
+
+
+@dataclass
+class VolatileType:
+    """Wrapper around a single inner ``type`` (by its name, the ``volatile``-qualified form)."""
+
+    type: TypeInfo
+
+
+@dataclass
+class ConstantType:
+    """Wrapper around a single inner ``type`` (by its name, the ``const``-qualified form)."""
+
+    type: TypeInfo
+
+
+@dataclass
+class PointerType:
+    """Wrapper around a single inner ``type`` (by its name, a pointer to that type)."""
+
+    type: TypeInfo
+
+
+@dataclass
+class ReferenceType:
+    """Wrapper around a single inner ``type`` (by its name, a reference to that type)."""
+
+    type: TypeInfo
+
+
+@dataclass
+class Enumerator:
+    """One named constant with its integer value; stored in ``EnumerationType.enumerators``."""
+
+    name: str
+    value: int
+
+
+@dataclass
+class EnumerationType:
+    """An enumeration type.
+
+    Attributes:
+        name: Enumeration name.
+        byte_size: Storage size in bytes.
+        encoding: :class:`TypeEncoding` of the underlying integer type,
+            or ``None`` when not determinable.
+        base_type: Resolved underlying type (usually a :class:`PrimitiveType`).
+        enumerators: List of named enumeration constants.
+    """
+
+    name: str
+    byte_size: int
+    encoding: TypeEncoding | None
+    base_type: TypeInfo
+    enumerators: list[Enumerator] = field(default_factory=list)  # fresh list per instance
+
+
+@dataclass
+class UnspecifiedType:
+    """A type known only by ``name``; no size, encoding or inner type is recorded."""
+
+    name: str
+
+
+@dataclass
+class StructMember:
+    """One member entry of a :class:`StructureType` or :class:`UnionType`.
+
+    NOTE(review): ``offset`` is presumably the byte offset inside the
+    enclosing aggregate — confirm the unit against the producers.
+    """
+
+    name: str
+    type: TypeInfo
+    offset: int
+
+
+@dataclass
+class StructureType:
+    """A structure described by its name, ``byte_size`` and a list of :class:`StructMember`."""
+
+    name: str
+    byte_size: int
+    member: list[StructMember] = field(default_factory=list)  # fresh list per instance
+
+
+@dataclass
+class ClassMember:
+    """One member entry of a :class:`ClassType`.
+
+    NOTE(review): ``linkage_name`` is presumably the mangled symbol name and
+    ``external`` an externally-visible flag — neither is established by this
+    module; confirm against the producers.
+    """
+
+    name: str
+    linkage_name: str
+    type: TypeInfo
+    offset: int
+    accessibility: Any  # Accessibility
+    external: bool
+
+
+@dataclass
+class ClassType:
+    """A class described by its name, ``byte_size`` and a list of :class:`ClassMember`."""
+
+    name: str
+    byte_size: int
+    member: list[ClassMember] = field(default_factory=list)  # fresh list per instance
+
+
+@dataclass
+class UnionType:
+    """A union described by its name, ``byte_size`` and its alternatives.
+
+    The alternatives reuse :class:`StructMember` rather than a dedicated member class.
+    """
+
+    name: str
+    byte_size: int
+    alternatives: list[StructMember] = field(default_factory=list)  # fresh list per instance
+
+
+@dataclass
+class SubroutineType:
+    """A subroutine (function) type: name, return type and parameter types.
+
+    NOTE(review): ``prototyped`` is typed ``int`` — presumably a 0/1 flag
+    taken from the debug info; confirm against the producers.
+    """
+
+    name: str
+    prototyped: int
+    return_type: TypeInfo
+    parameters: list[TypeInfo] = field(default_factory=list)  # fresh list per instance
+
+
+@dataclass
+class VariableType:
+    """A variable: name, type descriptor, location and size.
+
+    Not a member of the ``TypeInfo`` union.
+    NOTE(review): whether ``location`` is absolute or module-relative is
+    decided by the producer, not by this class — confirm per caller.
+    """
+
+    name: str
+    type: TypeInfo
+    location: int
+    size: int
+
+
+@dataclass
+class DataType:
+    """A named data item carrying a ``value``, a type descriptor and a ``datakind``.
+
+    Not a member of the ``TypeInfo`` union.
+    NOTE(review): ``value`` and ``datakind`` are typed ``Any``; their concrete
+    types are not visible in this module.
+    """
+
+    name: str
+    value: Any
+    type: TypeInfo
+    datakind: Any
+
+
+# Union of the type-descriptor dataclasses of this module. Enumerator,
+# StructMember, ClassMember, VariableType and DataType are not part of it.
+# The dataclasses can annotate fields with ``TypeInfo`` ahead of this
+# assignment because ``from __future__ import annotations`` leaves annotations
+# unevaluated.
+TypeInfo: TypeAlias = (
+    PrimitiveType
+    | ArrayType
+    | TypeDefinition
+    | VolatileType
+    | ConstantType
+    | PointerType
+    | ReferenceType
+    | EnumerationType
+    | UnspecifiedType
+    | StructureType
+    | ClassType
+    | UnionType
+    | SubroutineType
+)
+
+# Backward-compatible alias (legacy typo retained intentionally).
+TypeDefiniton = TypeDefinition