Skip to content

Merge upstream #12

Open
vickiegpt wants to merge 5 commits into
multikernel:main from
vickiegpt:main
Open

Merge upstream #12
vickiegpt wants to merge 5 commits into
multikernel:main from
vickiegpt:main

Conversation

@vickiegpt
Copy link
Copy Markdown

No description provided.

Copilot AI review requested due to automatic review settings May 19, 2026 21:48
Copy link
Copy Markdown

Copilot AI left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Pull request overview

This PR brings in upstream changes that add CUDA/GPU support for mapping a daxfs DAX region into GPU address space (to enable PCIe AtomicOps coordination), plus new GPU/agent benchmarking and plotting utilities.

Changes:

  • Add a new kernel ioctl (DAXFS_IOC_GET_GPU_INFO) and corresponding userspace GPU mapping helper (tools/daxfs-gpu-map.*) for exposing DAX layout/offsets to CUDA.
  • Introduce CUDA-side coordination primitives (include/daxfs_gpu.h) and a CUDA benchmark (tests/bench_gpu.cu) with plotting scripts.
  • Add an AI-agent speculative branching benchmark script (tests/bench_agent.sh) and bench result TSVs.

Reviewed changes

Copilot reviewed 14 out of 35 changed files in this pull request and generated 9 comments.

Show a summary per file
File Description
tools/Makefile Adds a gpu target to build the optional GPU mapping object.
tools/daxfs-gpu-map.h Declares the userspace GPU mapping API and convenience accessors.
tools/daxfs-gpu-map.c Implements dma-buf and /dev/mem mapping paths into CUDA device address space.
tests/plot_gpu.py Adds plotting for GPU coordination, P2P DMA, and multi-tenant experiments.
tests/plot_bench.py Adds plotting for agent scalability/depth/commit-cost experiments.
tests/cuda_compat/crt/math_functions.h Introduces a CUDA/glibc compatibility wrapper header.
tests/bench_gpu.cu Adds a CUDA benchmark exercising coordination primitives, P2P DMA, and multi-tenant ACL simulation.
tests/bench_agent.sh Adds a root-run benchmark script for daxfs speculative branching vs tmpfs/overlayfs.
Makefile Adds a top-level gpu make target that builds the tools GPU artifact.
include/daxfs_gpu.h Adds CUDA device-inline primitives mirroring kernel coordination/page-cache protocols.
include/daxfs_format.h Adds DAXFS_IOC_GET_GPU_INFO and struct daxfs_gpu_info to the shared format header.
daxfs/file.c Implements the new ioctl to return physical address and offset/layout info to userspace.
bench_results/gpu.tsv Adds sample GPU benchmark output.
bench_results/agents.tsv Adds sample agent benchmark output.

💡 Add Copilot custom instructions for smarter, more guided reviews. Learn how to get started.

Comment thread daxfs/file.c
Comment on lines +778 to +806
coord_lock);
gi.commit_seq_off = offsetof(struct daxfs_global_coord,
commit_sequence);
}

pcache_off = le64_to_cpu(info->super->pcache_offset);
if (pcache_off && info->pcache) {
struct daxfs_pcache_header *hdr;

hdr = info->pcache->header;
gi.pcache_offset = pcache_off;
gi.pcache_slots_offset = pcache_off +
le64_to_cpu(hdr->slot_meta_offset);
gi.pcache_data_offset = pcache_off +
le64_to_cpu(hdr->slot_data_offset);
gi.pcache_slot_count = info->pcache->slot_count;
gi.pcache_slot_stride =
sizeof(struct daxfs_pcache_slot);
gi.state_tag_off = offsetof(struct daxfs_pcache_slot,
state_tag);
gi.pending_count_off =
offsetof(struct daxfs_pcache_header,
pending_count);
}

if (copy_to_user((void __user *)arg, &gi, sizeof(gi)))
return -EFAULT;
return 0;
}
Comment thread tools/daxfs-gpu-map.c
Comment on lines +118 to +125
addr = mmap(NULL, gpu->info.dax_size, PROT_READ | PROT_WRITE,
MAP_SHARED, fd, (off_t)gpu->info.dax_phys_addr);
close(fd);

if (addr == MAP_FAILED) {
perror("daxfs-gpu-map: mmap /dev/mem");
return -1;
}
Comment thread tools/daxfs-gpu-map.c
Comment on lines +164 to +176
fprintf(stderr, "daxfs-gpu-map: dmabuf_mmap: fd=%d size=%llu\n",
dmabuf_fd, (unsigned long long)gpu->info.dax_size);

addr = mmap(NULL, gpu->info.dax_size, PROT_READ | PROT_WRITE,
MAP_SHARED, dmabuf_fd, 0);
if (addr == MAP_FAILED) {
perror("daxfs-gpu-map: mmap dma-buf fd");
close(dmabuf_fd);
return -1;
}

fprintf(stderr, "daxfs-gpu-map: dmabuf mmap'd at %p\n", addr);

Comment thread tools/daxfs-gpu-map.h
Comment on lines +17 to +26
#include <cuda.h>
#include <stdint.h>
#include "daxfs_format.h"

struct daxfs_gpu_map {
/* GPU mapping */
CUdeviceptr base; /* GPU pointer to DAX base */
CUexternalMemory ext_mem; /* Handle (dma-buf path) */
size_t size; /* Total mapped size */

Comment thread include/daxfs_gpu.h
Comment on lines +21 to +35
#ifdef __CUDACC__

/*
 * Slot state values and state/tag packing helpers, re-exported so GPU code
 * follows the kernel conventions. Each group is skipped when its first macro
 * is already defined, so this header can coexist with daxfs_format.h.
 */
#if !defined(PCACHE_STATE_FREE)
#define PCACHE_STATE_FREE    0
#define PCACHE_STATE_PENDING 1
#define PCACHE_STATE_VALID   2
#endif

/* A packed word keeps the state in its two low bits and the tag above them. */
#if !defined(PCACHE_STATE)
#define PCACHE_STATE(word)   ((word) & 3ULL)
#define PCACHE_TAG(word)     ((word) >> 2)
#define PCACHE_MAKE(st, tg)  ((st) | ((unsigned long long)(tg) << 2))
#endif
Comment thread tests/bench_gpu.cu
Comment on lines +238 to +248

for (int i = 0; i < iters; i++) {
/* Explicit GPU load from host (PCIe read TLPs) */
for (unsigned int b = 0; b < xfer_size; b += sizeof(unsigned long long)) {
unsigned long long val =
*(volatile const unsigned long long *)(src + b);
/* Store to device mem so compiler doesn't elide the load */
if (dev_dst)
*(unsigned long long *)(dev_dst + b) = val;
}
}
Comment on lines +1 to +15
/* Local wrapper to fix glibc 2.41+ / CUDA rsqrt noexcept conflict */
#ifndef _DAXFS_MATH_FUNCTIONS_COMPAT_H
#define _DAXFS_MATH_FUNCTIONS_COMPAT_H

/* Include the real CUDA header first */
#include_next <crt/math_functions.h>

/* Now suppress the glibc redeclarations by pre-declaring them
in a compatible way before glibc's math.h gets pulled in. */
#ifdef __cplusplus
extern "C" {
#endif

/* Override glibc's rsqrt/rsqrtf declarations to avoid noexcept mismatch.
We define them as weak aliases so they don't conflict. */
Comment thread tools/Makefile
Comment on lines +20 to +25
CUDA_CFLAGS = -I../include -I$(CUDA_PATH)/include
CUDA_LDFLAGS = -L$(CUDA_PATH)/lib64 -lcuda

GPU_TARGETS = daxfs-gpu-map.o

gpu: $(GPU_TARGETS)
Comment thread tests/plot_gpu.py
Comment on lines +121 to +146
sizes = [64, 256, 4096, 65536, 1 << 20]
labels = ['64B', '256B', '4KB', '64KB', '1MB']

for exp, label, color, marker in [
('gpu_p2p_read', 'GPU Read from CXL', '#2196F3', 'o'),
('gpu_p2p_write', 'GPU Write to CXL', '#FF5722', 's'),
]:
sub = df[df['experiment'] == exp]
if len(sub) == 0: continue
# thru_mops is actually GB/s * 1000 in this context
means, stds_v = [], []
for sz in sizes:
s = sub[sub['parameter'] == sz]
if len(s) == 0: continue
# Compute GB/s from latency and size
lat = s['latency_ns'].mean()
bw = sz / lat # bytes/ns = GB/s
means.append(bw)
stds_v.append(0)
if means:
ax.plot(range(len(means)), means, marker=marker, color=color,
linewidth=2, markersize=7, label=label)

ax.set_xticks(range(len(labels)))
ax.set_xticklabels(labels)
ax.set_xlabel('Transfer Size')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

None yet

Projects

None yet

Development

Successfully merging this pull request may close these issues.

3 participants