Merge upstream by vickiegpt · Pull Request #12 · multikernel/daxfs

vickiegpt · 2026-05-19T21:48:35Z

No description provided.

Copilot

Pull request overview

This PR brings in upstream changes that add CUDA/GPU support for mapping a daxfs DAX region into GPU address space (to enable PCIe AtomicOps coordination), plus new GPU/agent benchmarking and plotting utilities.

Changes:

Add a new kernel ioctl (DAXFS_IOC_GET_GPU_INFO) and corresponding userspace GPU mapping helper (tools/daxfs-gpu-map.*) for exposing DAX layout/offsets to CUDA.
Introduce CUDA-side coordination primitives (include/daxfs_gpu.h) and a CUDA benchmark (tests/bench_gpu.cu) with plotting scripts.
Add an AI-agent speculative branching benchmark script (tests/bench_agent.sh) and bench result TSVs.

Reviewed changes

Copilot reviewed 14 out of 35 changed files in this pull request and generated 9 comments.

Show a summary per file

File	Description
tools/Makefile	Adds a `gpu` target to build the optional GPU mapping object.
tools/daxfs-gpu-map.h	Declares the userspace GPU mapping API and convenience accessors.
tools/daxfs-gpu-map.c	Implements dma-buf and `/dev/mem` mapping paths into CUDA device address space.
tests/plot_gpu.py	Adds plotting for GPU coordination, P2P DMA, and multi-tenant experiments.
tests/plot_bench.py	Adds plotting for agent scalability/depth/commit-cost experiments.
tests/cuda_compat/crt/math_functions.h	Introduces a CUDA/glibc compatibility wrapper header.
tests/bench_gpu.cu	Adds a CUDA benchmark exercising coordination primitives, P2P DMA, and multi-tenant ACL simulation.
tests/bench_agent.sh	Adds a root-run benchmark script for daxfs speculative branching vs tmpfs/overlayfs.
Makefile	Adds a top-level `gpu` make target that builds the tools GPU artifact.
include/daxfs_gpu.h	Adds CUDA device-inline primitives mirroring kernel coordination/page-cache protocols.
include/daxfs_format.h	Adds `DAXFS_IOC_GET_GPU_INFO` and `struct daxfs_gpu_info` to the shared format header.
daxfs/file.c	Implements the new ioctl to return physical address and offset/layout info to userspace.
bench_results/gpu.tsv	Adds sample GPU benchmark output.
bench_results/agents.tsv	Adds sample agent benchmark output.

💡 Add Copilot custom instructions for smarter, more guided reviews. Learn how to get started.

+						     coord_lock);
+			gi.commit_seq_off = offsetof(struct daxfs_global_coord,
+						     commit_sequence);
+		}
+
+		pcache_off = le64_to_cpu(info->super->pcache_offset);
+		if (pcache_off && info->pcache) {
+			struct daxfs_pcache_header *hdr;
+
+			hdr = info->pcache->header;
+			gi.pcache_offset = pcache_off;
+			gi.pcache_slots_offset = pcache_off +
+				le64_to_cpu(hdr->slot_meta_offset);
+			gi.pcache_data_offset = pcache_off +
+				le64_to_cpu(hdr->slot_data_offset);
+			gi.pcache_slot_count = info->pcache->slot_count;
+			gi.pcache_slot_stride =
+				sizeof(struct daxfs_pcache_slot);
+			gi.state_tag_off = offsetof(struct daxfs_pcache_slot,
+						    state_tag);
+			gi.pending_count_off =
+				offsetof(struct daxfs_pcache_header,
+					 pending_count);
+		}
+
+		if (copy_to_user((void __user *)arg, &gi, sizeof(gi)))
+			return -EFAULT;
+		return 0;
+	}


+	addr = mmap(NULL, gpu->info.dax_size, PROT_READ | PROT_WRITE,
+		    MAP_SHARED, fd, (off_t)gpu->info.dax_phys_addr);
+	close(fd);
+
+	if (addr == MAP_FAILED) {
+		perror("daxfs-gpu-map: mmap /dev/mem");
+		return -1;
+	}


+	fprintf(stderr, "daxfs-gpu-map: dmabuf_mmap: fd=%d size=%llu\n",
+		dmabuf_fd, (unsigned long long)gpu->info.dax_size);
+
+	addr = mmap(NULL, gpu->info.dax_size, PROT_READ | PROT_WRITE,
+		    MAP_SHARED, dmabuf_fd, 0);
+	if (addr == MAP_FAILED) {
+		perror("daxfs-gpu-map: mmap dma-buf fd");
+		close(dmabuf_fd);
+		return -1;
+	}
+
+	fprintf(stderr, "daxfs-gpu-map: dmabuf mmap'd at %p\n", addr);
+


+#include <cuda.h>
+#include <stdint.h>
+#include "daxfs_format.h"
+
+struct daxfs_gpu_map {
+	/* GPU mapping */
+	CUdeviceptr			base;		/* GPU pointer to DAX base */
+	CUexternalMemory		ext_mem;	/* Handle (dma-buf path) */
+	size_t				size;		/* Total mapped size */
+


+#ifdef __CUDACC__
+
+/* Re-export the state/tag helpers so GPU code matches kernel conventions.
+ * Guarded so this header can coexist with daxfs_format.h. */
+#ifndef PCACHE_STATE_FREE
+#define PCACHE_STATE_FREE    0
+#define PCACHE_STATE_PENDING 1
+#define PCACHE_STATE_VALID   2
+#endif
+
+#ifndef PCACHE_STATE
+#define PCACHE_STATE(v)        ((v) & 3ULL)
+#define PCACHE_TAG(v)          ((v) >> 2)
+#define PCACHE_MAKE(state, tag) (((unsigned long long)(tag) << 2) | (state))
+#endif


+
+    for (int i = 0; i < iters; i++) {
+        /* Explicit GPU load from host (PCIe read TLPs) */
+        for (unsigned int b = 0; b < xfer_size; b += sizeof(unsigned long long)) {
+            unsigned long long val =
+                *(volatile const unsigned long long *)(src + b);
+            /* Store to device mem so compiler doesn't elide the load */
+            if (dev_dst)
+                *(unsigned long long *)(dev_dst + b) = val;
+        }
+    }


+/* Local wrapper to fix glibc 2.41+ / CUDA rsqrt noexcept conflict */
+#ifndef _DAXFS_MATH_FUNCTIONS_COMPAT_H
+#define _DAXFS_MATH_FUNCTIONS_COMPAT_H
+
+/* Include the real CUDA header first */
+#include_next <crt/math_functions.h>
+
+/* Now suppress the glibc redeclarations by pre-declaring them
+   in a compatible way before glibc's math.h gets pulled in. */
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Override glibc's rsqrt/rsqrtf declarations to avoid noexcept mismatch.
+   We define them as weak aliases so they don't conflict. */


+CUDA_CFLAGS = -I../include -I$(CUDA_PATH)/include
+CUDA_LDFLAGS = -L$(CUDA_PATH)/lib64 -lcuda
+
+GPU_TARGETS = daxfs-gpu-map.o
+
+gpu: $(GPU_TARGETS)


+    sizes = [64, 256, 4096, 65536, 1 << 20]
+    labels = ['64B', '256B', '4KB', '64KB', '1MB']
+
+    for exp, label, color, marker in [
+        ('gpu_p2p_read', 'GPU Read from CXL', '#2196F3', 'o'),
+        ('gpu_p2p_write', 'GPU Write to CXL', '#FF5722', 's'),
+    ]:
+        sub = df[df['experiment'] == exp]
+        if len(sub) == 0: continue
+        # thru_mops is actually GB/s * 1000 in this context
+        means, stds_v = [], []
+        for sz in sizes:
+            s = sub[sub['parameter'] == sz]
+            if len(s) == 0: continue
+            # Compute GB/s from latency and size
+            lat = s['latency_ns'].mean()
+            bw = sz / lat  # bytes/ns = GB/s
+            means.append(bw)
+            stds_v.append(0)
+        if means:
+            ax.plot(range(len(means)), means, marker=marker, color=color,
+                    linewidth=2, markersize=7, label=label)
+
+    ax.set_xticks(range(len(labels)))
+    ax.set_xticklabels(labels)
+    ax.set_xlabel('Transfer Size')


Yuy1L1 and others added 4 commits February 25, 2026 21:46

update

99bf3d1

add daxfs agent bench.

c3145b6

update

09ee1a2

update

3fe00db

Copilot AI review requested due to automatic review settings May 19, 2026 21:48

Merge branch 'main' into main

bebd628

Copilot started reviewing on behalf of vickiegpt May 19, 2026 21:49 View session

Copilot AI reviewed May 19, 2026

View reviewed changes

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Merge upstream #12

Merge upstream #12
vickiegpt wants to merge 5 commits into
multikernel:main from
vickiegpt:main

vickiegpt commented May 19, 2026

Uh oh!

Copilot AI left a comment

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

3 participants

Conversation

vickiegpt commented May 19, 2026

Uh oh!

Copilot AI left a comment

Choose a reason for hiding this comment

Pull request overview

Reviewed changes

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

3 participants