From 212bf61435921e5bbc297651ed0e09414319886f Mon Sep 17 00:00:00 2001 From: Mayank Agrawal Date: Mon, 18 May 2026 15:17:56 +0530 Subject: [PATCH 1/3] SSP Support for Win-ARM64 --- winsup/utils/ssp.c | 328 ++++++++++++++++++++------------------------- 1 file changed, 142 insertions(+), 186 deletions(-) diff --git a/winsup/utils/ssp.c b/winsup/utils/ssp.c index af9489ab7..50463dc0f 100644 --- a/winsup/utils/ssp.c +++ b/winsup/utils/ssp.c @@ -52,30 +52,13 @@ typedef DWORD64 CONTEXT_REG; #define KERNEL_ADDR 0x00007FF000000000 #define CONTEXT_SP Sp #define CONTEXT_IP Pc -#define CONTEXT_LR Lr typedef DWORD64 CONTEXT_REG; #define CONTEXT_REG_FMT "%016llx" #define ADDR_SSCANF_FMT "%lli" -/* PSTATE.SS (Software Step) lives in bit[21]. Setting it requests a - single-step debug exception after the next instruction. */ -#define ARM64_PSR_SS 0x00200000 #else #error unimplemented for this target #endif -/* Software breakpoint instruction encoding differs by architecture. - - x86/x86_64 uses 1-byte INT3 (0xCC) - - AArch64 uses 4-byte BRK #0xF000 (0xD43E0000), which matches MSVC's - __debugbreak recommendation on Windows/ARM64. */ -#if defined(__aarch64__) -# define SW_BREAKPOINT_SIZE 4 -# define AARCH64_BRK_INSN 0xD43E0000u -#elif defined(__i386__) || defined(__x86_64__) -# define SW_BREAKPOINT_SIZE 1 -#else -# error unimplemented for this target -#endif - #define TRACE_SSP 0 #define VERBOSE 1 @@ -109,11 +92,18 @@ typedef struct { typedef struct { CONTEXT_REG address; - unsigned char real_bytes[SW_BREAKPOINT_SIZE]; +#if defined(__aarch64__) + unsigned char real_insn[4]; /* ARM64 instructions are 4 bytes */ +#else + unsigned char real_byte; +#endif } PendingBreakpoints; CONTEXT_REG low_pc, high_pc=0; CONTEXT_REG last_pc=0, pc, last_sp=0, sp; +#if defined(__aarch64__) +CONTEXT_REG last_lr=0, lr; +#endif int total_cycles, count; HANDLE hProcess; PROCESS_INFORMATION procinfo; @@ -148,43 +138,17 @@ int num_dlls=0; PendingBreakpoints pending_breakpoints[MAXPENDS]; int num_breakpoints=0; -static int -patch_code_bytes (CONTEXT_REG address, const void *bytes, SIZE_T len) -{ - DWORD old_prot = 0, tmp_prot = 0; - SIZE_T written = 0; - - if (!VirtualProtectEx (hProcess, (LPVOID)address, len, - PAGE_EXECUTE_READWRITE, &old_prot)) - return 0; - - if (!WriteProcessMemory (hProcess, (LPVOID)address, - bytes, len, &written) - || written != len) - { - VirtualProtectEx (hProcess, (LPVOID)address, len, old_prot, &tmp_prot); - return 0; - } - - FlushInstructionCache (hProcess, (LPCVOID)address, len); - VirtualProtectEx (hProcess, (LPVOID)address, len, old_prot, &tmp_prot); - return 1; -} - static void add_breakpoint (CONTEXT_REG address) { int i; - SIZE_T nread; - + SIZE_T rv; #if defined(__aarch64__) - /* brk #0xF000 */ - static const DWORD trap_insn = AARCH64_BRK_INSN; + static unsigned char brk_insn[] = { 0x00, 0x00, 0x20, 0xd4 }; /* BRK #0, which matches MSVC's + __debugbreak recommendation on Windows/ARM64.*/ #else - /* int3 */ - static const unsigned char trap_insn = 0xCC; + static unsigned char int3[] = { 0xcc }; #endif - for (i=0; i= num_breakpoints) num_breakpoints = i+1; } @@ -222,15 +184,23 @@ static int remove_breakpoint (CONTEXT_REG address) { int i; - + SIZE_T rv; for (i=0; i last_pc+10) - { - if (++qq % 100 == 0) - fprintf (stderr, " " CONTEXT_REG_FMT " %d %d \r", - pc, ncalls, opcode_count); - } - } -#else - if (pc < last_pc || pc > last_pc+10) + if (pc < last_pc || pc > last_pc+10) { static int ncalls=0; static int qq=0; + int is_call; if (++qq % 100 == 0) fprintf (stderr, " " CONTEXT_REG_FMT " %d %d \r", pc, ncalls, opcode_count); - - if (sp == last_sp-sizeof(CONTEXT_REG)) +#if defined(__aarch64__) + is_call = (lr != last_lr && lr == last_pc + 4); +#else + is_call = (sp == last_sp-sizeof(CONTEXT_REG)); +#endif + if (is_call) { - ncalls++; + ncalls++; store_call_edge (last_pc, pc); - if (last_pc < KERNEL_ADDR && pc > KERNEL_ADDR) + } + if (last_pc && last_pc < KERNEL_ADDR && pc > KERNEL_ADDR) + { +#if defined(__aarch64__) + /* On ARM64, the return address for a BL/BLR is in + LR. For a tail-call (B/BR) it isn't, but LR will + still hold the return address of whatever frame + made the original call into our code, so it's the + correct place to break to resume stepping. Place + a breakpoint there and stop single-stepping until + we return from the kernel/DLL call. */ + CONTEXT_REG retaddr = lr; + if (verbose) + printf ("skip kernel call: " CONTEXT_REG_FMT " -> " CONTEXT_REG_FMT ", ret = " CONTEXT_REG_FMT "\n", + last_pc, pc, retaddr); + if (retaddr && retaddr < KERNEL_ADDR) { -#if 0 - CONTEXT_REG retaddr; - SIZE_T rv; - ReadProcessMemory (hProcess, - (void *)sp, - (LPVOID)&(retaddr), - sizeof(retaddr), &rv); - printf ("call last_pc = " CONTEXT_REG_FMT " pc = " CONTEXT_REG_FMT " rv = " CONTEXT_REG_FMT "\n", - last_pc, pc, retaddr); - /* experimental - try to skip kernel calls for speed */ add_breakpoint (retaddr); set_step_threads (event.dwThreadId, 0); -#endif } +#else + CONTEXT_REG retaddr; + SIZE_T bytes_read; + ReadProcessMemory (hProcess, + (void *)sp, + (LPVOID)&(retaddr), + sizeof(retaddr), &bytes_read); + if (verbose) + printf ("skip kernel call: " CONTEXT_REG_FMT " -> " CONTEXT_REG_FMT ", ret = " CONTEXT_REG_FMT "\n", + last_pc, pc, retaddr); + add_breakpoint (retaddr); + set_step_threads (event.dwThreadId, 0); +#endif } } -#endif total_cycles++; last_sp = sp; last_pc = pc; +#if defined(__aarch64__) + last_lr = lr; +#endif if (pc >= low_pc && pc < high_pc) hits[(pc - low_pc)/2] ++; break; @@ -691,11 +639,12 @@ run_program (char *cmdline) dump_registers (hThread); } contv = DBG_EXCEPTION_NOT_HANDLED; - running = 0; + if (!event.u.Exception.dwFirstChance) + running = 0; break; } - if (rv) + if (!rv) { if (pc == thread_return_address[tix]) { @@ -706,9 +655,9 @@ run_program (char *cmdline) SetThreadContext (hThread, &context); } #elif defined(__aarch64__) - if (context.Cpsr & ARM64_PSR_SS) + if (context.Cpsr & 0x00200000) { - context.Cpsr &= ~ARM64_PSR_SS; + context.Cpsr &= ~0x00200000; /* PSTATE.SS (single step) flag */ SetThreadContext (hThread, &context); } #else @@ -724,9 +673,9 @@ run_program (char *cmdline) SetThreadContext (hThread, &context); } #elif defined(__aarch64__) - if (!(context.Cpsr & ARM64_PSR_SS)) + if (!(context.Cpsr & 0x00200000)) { - context.Cpsr |= ARM64_PSR_SS; + context.Cpsr |= 0x00200000; /* PSTATE.SS (single step) flag */ SetThreadContext (hThread, &context); } #else @@ -1093,10 +1042,17 @@ main (int argc, char **argv) } memset (hits, 0, range+4); - fprintf (stderr, "prun: [" CONTEXT_REG_FMT "," CONTEXT_REG_FMT "] Running '%s'\n", + fprintf (stderr, "prun: [" CONTEXT_REG_FMT "," CONTEXT_REG_FMT "] Running '%s'\n", low_pc, high_pc, argv[optind]); - - run_program (argv[optind]); + { + char *cmdline_copy = strdup (argv[optind]); + if (!cmdline_copy) + { + fprintf (stderr, "Out of memory duplicating cmdline\n"); + exit (1); + } + run_program (cmdline_copy); + } hdr.lpc = low_pc; hdr.hpc = high_pc; From 5efdf7387ce451e514b454a65fa787c9ca8e976f Mon Sep 17 00:00:00 2001 From: Mayank Agrawal Date: Mon, 18 May 2026 17:16:56 +0530 Subject: [PATCH 2/3] fix indentation and other issues --- winsup/utils/ssp.c | 87 ++++++++++++++++++++++------------------------ 1 file changed, 41 insertions(+), 46 deletions(-) diff --git a/winsup/utils/ssp.c b/winsup/utils/ssp.c index 50463dc0f..9980b8a90 100644 --- a/winsup/utils/ssp.c +++ b/winsup/utils/ssp.c @@ -41,7 +41,7 @@ static struct option longopts[] = static char opts[] = "+cdehlstvV"; -#if defined(__x86_64__) +#ifdef __x86_64__ #define KERNEL_ADDR 0x00007FF000000000 #define CONTEXT_SP Rsp #define CONTEXT_IP Rip @@ -237,14 +237,14 @@ set_step_threads (int threadId, int trace) thread_step_flags[tix] = trace; #if defined(__i386__) || defined(__x86_64__) if (trace) - context.EFlags |= 0x100; /* TRAP (single step) flag */ + context.EFlags |= 0x100; /* TRAP (single step) flag */ else - context.EFlags &= ~0x100; /* TRAP (single step) flag */ + context.EFlags &= ~0x100; /* TRAP (single step) flag */ #elif defined(__aarch64__) if (trace) - context.Cpsr |= 0x00200000; /* PSTATE.SS (single step) flag */ + context.Cpsr |= 0x00200000; /* PSTATE.SS (single step) flag */ else - context.Cpsr &= ~0x00200000; /* PSTATE.SS (single step) flag */ + context.Cpsr &= ~0x00200000; /* PSTATE.SS (single step) flag */ #else #error unimplemented for this target #endif @@ -302,17 +302,17 @@ dump_registers (HANDLE thread) { context.ContextFlags = CONTEXT_FULL; GetThreadContext (thread, &context); -#if defined(__x86_64__) +#ifdef __x86_64__ printf ("eax %016llx ebx %016llx ecx %016llx edx %016llx eip\n", context.Rax, context.Rbx, context.Rcx, context.Rdx); printf ("esi %016llx edi %016llx ebp %016llx esp %016llx %016llx\n", context.Rsi, context.Rdi, context.Rbp, context.Rsp, context.Rip); #elif defined(__aarch64__) - printf (" x0 %016llx x1 %016llx x2 %016llx x3 %016llx\n", + printf ("x0 %016llx x1 %016llx x2 %016llx x3 %016llx\n", context.X[0], context.X[1], context.X[2], context.X[3]); - printf (" x4 %016llx x5 %016llx x6 %016llx x7 %016llx\n", + printf ("x4 %016llx x5 %016llx x6 %016llx x7 %016llx\n", context.X[4], context.X[5], context.X[6], context.X[7]); - printf (" x8 %016llx x9 %016llx x10 %016llx x11 %016llx\n", + printf ("x8 %016llx x9 %016llx x10 %016llx x11 %016llx\n", context.X[8], context.X[9], context.X[10], context.X[11]); printf ("x12 %016llx x13 %016llx x14 %016llx x15 %016llx\n", context.X[12], context.X[13], context.X[14], context.X[15]); @@ -322,9 +322,9 @@ dump_registers (HANDLE thread) context.X[20], context.X[21], context.X[22], context.X[23]); printf ("x24 %016llx x25 %016llx x26 %016llx x27 %016llx\n", context.X[24], context.X[25], context.X[26], context.X[27]); - printf ("x28 %016llx fp %016llx lr %016llx\n", + printf ("x28 %016llx fp %016llx lr %016llx\n", context.X[28], context.Fp, context.Lr); - printf (" sp %016llx pc %016llx cpsr %08x\n", + printf ("sp %016llx pc %016llx cpsr %08x\n", context.Sp, context.Pc, context.Cpsr); #else #error unimplemented for this target @@ -520,10 +520,8 @@ run_program (char *cmdline) if (remove_breakpoint ((CONTEXT_REG)event.u.Exception.ExceptionRecord.ExceptionAddress)) { #if defined(__aarch64__) - /* On ARM64, PC points at the BRK instruction; no adjustment needed. */ if (!rv) SetThreadContext (hThread, &context); - /* Return address is in LR (X30), not on the stack. */ thread_return_address[tix] = context.Lr; #else context.CONTEXT_IP --; @@ -570,52 +568,45 @@ run_program (char *cmdline) { static int ncalls=0; static int qq=0; - int is_call; if (++qq % 100 == 0) fprintf (stderr, " " CONTEXT_REG_FMT " %d %d \r", pc, ncalls, opcode_count); #if defined(__aarch64__) - is_call = (lr != last_lr && lr == last_pc + 4); + if (lr != last_lr && lr == last_pc + 4) #else - is_call = (sp == last_sp-sizeof(CONTEXT_REG)); + if (sp == last_sp-sizeof(CONTEXT_REG)) #endif - if (is_call) { - ncalls++; + ncalls++; store_call_edge (last_pc, pc); - } - if (last_pc && last_pc < KERNEL_ADDR && pc > KERNEL_ADDR) - { + if (last_pc < KERNEL_ADDR && pc > KERNEL_ADDR) + { #if defined(__aarch64__) - /* On ARM64, the return address for a BL/BLR is in - LR. For a tail-call (B/BR) it isn't, but LR will - still hold the return address of whatever frame - made the original call into our code, so it's the - correct place to break to resume stepping. Place - a breakpoint there and stop single-stepping until - we return from the kernel/DLL call. */ - CONTEXT_REG retaddr = lr; - if (verbose) + CONTEXT_REG retaddr = lr; + if (verbose) printf ("skip kernel call: " CONTEXT_REG_FMT " -> " CONTEXT_REG_FMT ", ret = " CONTEXT_REG_FMT "\n", last_pc, pc, retaddr); - if (retaddr && retaddr < KERNEL_ADDR) - { + if (retaddr && retaddr < KERNEL_ADDR) + { + add_breakpoint (retaddr); + set_step_threads (event.dwThreadId, 0); + } +#else +#if 0 + CONTEXT_REG retaddr; + SIZE_T rv; + ReadProcessMemory (hProcess, + (void *)sp, + (LPVOID)&(retaddr), + sizeof(retaddr), &rv); + printf ("call last_pc = " CONTEXT_REG_FMT " pc = " CONTEXT_REG_FMT " rv = " CONTEXT_REG_FMT "\n", + last_pc, pc, retaddr); + /* experimental - try to skip kernel calls for speed */ add_breakpoint (retaddr); set_step_threads (event.dwThreadId, 0); - } -#else - CONTEXT_REG retaddr; - SIZE_T bytes_read; - ReadProcessMemory (hProcess, - (void *)sp, - (LPVOID)&(retaddr), - sizeof(retaddr), &bytes_read); - if (verbose) - printf ("skip kernel call: " CONTEXT_REG_FMT " -> " CONTEXT_REG_FMT ", ret = " CONTEXT_REG_FMT "\n", - last_pc, pc, retaddr); - add_breakpoint (retaddr); - set_step_threads (event.dwThreadId, 0); #endif +#endif + } } } @@ -639,8 +630,12 @@ run_program (char *cmdline) dump_registers (hThread); } contv = DBG_EXCEPTION_NOT_HANDLED; +#if defined(__aarch64__) if (!event.u.Exception.dwFirstChance) - running = 0; + running = 0; +#else + running = 0; +#endif break; } From a025f9f38ed8acfcaaf6b61e66f1c3aba7a84c18 Mon Sep 17 00:00:00 2001 From: Mayank Agrawal Date: Tue, 2 Jun 2026 14:49:57 +0530 Subject: [PATCH 3/3] v2 patch for SSP.C --- winsup/utils/ssp.c | 70 ++++++++++++++++++++++------------------------ 1 file changed, 34 insertions(+), 36 deletions(-) diff --git a/winsup/utils/ssp.c b/winsup/utils/ssp.c index 9980b8a90..32f813961 100644 --- a/winsup/utils/ssp.c +++ b/winsup/utils/ssp.c @@ -90,13 +90,19 @@ typedef struct { char *name; } DllInfo; -typedef struct { - CONTEXT_REG address; -#if defined(__aarch64__) - unsigned char real_insn[4]; /* ARM64 instructions are 4 bytes */ +/* Size in bytes of the software breakpoint instruction (INT3 on x86, + BRK on AArch64). */ +#if defined(__i386__) || defined(__x86_64__) +#define SW_BREAKPOINT_SIZE 1 +#elif defined(__aarch64__) +#define SW_BREAKPOINT_SIZE 4 #else - unsigned char real_byte; +#error unimplemented for this target #endif + +typedef struct { + CONTEXT_REG address; + unsigned char real_insn[SW_BREAKPOINT_SIZE]; } PendingBreakpoints; CONTEXT_REG low_pc, high_pc=0; @@ -143,11 +149,12 @@ add_breakpoint (CONTEXT_REG address) { int i; SIZE_T rv; -#if defined(__aarch64__) - static unsigned char brk_insn[] = { 0x00, 0x00, 0x20, 0xd4 }; /* BRK #0, which matches MSVC's - __debugbreak recommendation on Windows/ARM64.*/ +#if defined(__i386__) || defined(__x86_64__) + static unsigned char brk_insn[] = { 0xcc }; +#elif defined(__aarch64__) + static unsigned char brk_insn[] = { 0x00, 0x00, 0x20, 0xd4 }; #else - static unsigned char int3[] = { 0xcc }; +#error unimplemented for this target #endif for (i=0; i= num_breakpoints) num_breakpoints = i+1; } @@ -190,17 +187,10 @@ remove_breakpoint (CONTEXT_REG address) if (pending_breakpoints[i].address == address) { pending_breakpoints[i].address = 0; -#if defined(__aarch64__) WriteProcessMemory (hProcess, (void *)address, pending_breakpoints[i].real_insn, - 4, &rv); -#else - WriteProcessMemory (hProcess, - (void *)address, - &(pending_breakpoints[i].real_byte), - 1, &rv); -#endif + SW_BREAKPOINT_SIZE, &rv); return 1; } } @@ -564,7 +554,7 @@ run_program (char *cmdline) } } - if (pc < last_pc || pc > last_pc+10) + if (pc < last_pc || pc > last_pc+10) { static int ncalls=0; static int qq=0; @@ -580,12 +570,12 @@ run_program (char *cmdline) ncalls++; store_call_edge (last_pc, pc); if (last_pc < KERNEL_ADDR && pc > KERNEL_ADDR) - { + { #if defined(__aarch64__) CONTEXT_REG retaddr = lr; if (verbose) - printf ("skip kernel call: " CONTEXT_REG_FMT " -> " CONTEXT_REG_FMT ", ret = " CONTEXT_REG_FMT "\n", - last_pc, pc, retaddr); + printf ("skip kernel call: " CONTEXT_REG_FMT " -> " CONTEXT_REG_FMT ", ret = " CONTEXT_REG_FMT "\n", + last_pc, pc, retaddr); if (retaddr && retaddr < KERNEL_ADDR) { add_breakpoint (retaddr); @@ -606,7 +596,7 @@ run_program (char *cmdline) set_step_threads (event.dwThreadId, 0); #endif #endif - } + } } } @@ -632,7 +622,7 @@ run_program (char *cmdline) contv = DBG_EXCEPTION_NOT_HANDLED; #if defined(__aarch64__) if (!event.u.Exception.dwFirstChance) - running = 0; + running = 0; #else running = 0; #endif @@ -1037,9 +1027,17 @@ main (int argc, char **argv) } memset (hits, 0, range+4); - fprintf (stderr, "prun: [" CONTEXT_REG_FMT "," CONTEXT_REG_FMT "] Running '%s'\n", + fprintf (stderr, "prun: [" CONTEXT_REG_FMT "," CONTEXT_REG_FMT "] Running '%s'\n", low_pc, high_pc, argv[optind]); { + /* CreateProcess (called below with lpApplicationName == NULL) is + documented to modify the lpCommandLine buffer in place. argv[optind] + points into our own argv, so passing it directly lets CreateProcess + scribble on it; this was observed on aarch64-cygwin as the command + line coming back mangled (e.g. 'test_hello.exe' -> 'st_hello.exxee') + on later use. Pass a private writable copy instead. It is not freed + because run_program() stores it in dll_info[0].name, which is read + later when printing the DLL-profile table. */ char *cmdline_copy = strdup (argv[optind]); if (!cmdline_copy) {