Merge upstream #12
Open
vickiegpt wants to merge 5 commits into
Open
Conversation
There was a problem hiding this comment.
Pull request overview
This PR brings in upstream changes that add CUDA/GPU support for mapping a daxfs DAX region into GPU address space (to enable PCIe AtomicOps coordination), plus new GPU/agent benchmarking and plotting utilities.
Changes:
- Add a new kernel ioctl (`DAXFS_IOC_GET_GPU_INFO`) and corresponding userspace GPU mapping helper (`tools/daxfs-gpu-map.*`) for exposing DAX layout/offsets to CUDA.
- Introduce CUDA-side coordination primitives (`include/daxfs_gpu.h`) and a CUDA benchmark (`tests/bench_gpu.cu`) with plotting scripts.
- Add an AI-agent speculative branching benchmark script (`tests/bench_agent.sh`) and bench result TSVs.
Reviewed changes
Copilot reviewed 14 out of 35 changed files in this pull request and generated 9 comments.
Show a summary per file
| File | Description |
|---|---|
| tools/Makefile | Adds a gpu target to build the optional GPU mapping object. |
| tools/daxfs-gpu-map.h | Declares the userspace GPU mapping API and convenience accessors. |
| tools/daxfs-gpu-map.c | Implements dma-buf and /dev/mem mapping paths into CUDA device address space. |
| tests/plot_gpu.py | Adds plotting for GPU coordination, P2P DMA, and multi-tenant experiments. |
| tests/plot_bench.py | Adds plotting for agent scalability/depth/commit-cost experiments. |
| tests/cuda_compat/crt/math_functions.h | Introduces a CUDA/glibc compatibility wrapper header. |
| tests/bench_gpu.cu | Adds a CUDA benchmark exercising coordination primitives, P2P DMA, and multi-tenant ACL simulation. |
| tests/bench_agent.sh | Adds a root-run benchmark script for daxfs speculative branching vs tmpfs/overlayfs. |
| Makefile | Adds a top-level gpu make target that builds the tools GPU artifact. |
| include/daxfs_gpu.h | Adds CUDA device-inline primitives mirroring kernel coordination/page-cache protocols. |
| include/daxfs_format.h | Adds DAXFS_IOC_GET_GPU_INFO and struct daxfs_gpu_info to the shared format header. |
| daxfs/file.c | Implements the new ioctl to return physical address and offset/layout info to userspace. |
| bench_results/gpu.tsv | Adds sample GPU benchmark output. |
| bench_results/agents.tsv | Adds sample agent benchmark output. |
💡 Add Copilot custom instructions for smarter, more guided reviews. Learn how to get started.
Comment on lines
+778
to
+806
| coord_lock); | ||
| gi.commit_seq_off = offsetof(struct daxfs_global_coord, | ||
| commit_sequence); | ||
| } | ||
|
|
||
| pcache_off = le64_to_cpu(info->super->pcache_offset); | ||
| if (pcache_off && info->pcache) { | ||
| struct daxfs_pcache_header *hdr; | ||
|
|
||
| hdr = info->pcache->header; | ||
| gi.pcache_offset = pcache_off; | ||
| gi.pcache_slots_offset = pcache_off + | ||
| le64_to_cpu(hdr->slot_meta_offset); | ||
| gi.pcache_data_offset = pcache_off + | ||
| le64_to_cpu(hdr->slot_data_offset); | ||
| gi.pcache_slot_count = info->pcache->slot_count; | ||
| gi.pcache_slot_stride = | ||
| sizeof(struct daxfs_pcache_slot); | ||
| gi.state_tag_off = offsetof(struct daxfs_pcache_slot, | ||
| state_tag); | ||
| gi.pending_count_off = | ||
| offsetof(struct daxfs_pcache_header, | ||
| pending_count); | ||
| } | ||
|
|
||
| if (copy_to_user((void __user *)arg, &gi, sizeof(gi))) | ||
| return -EFAULT; | ||
| return 0; | ||
| } |
Comment on lines
+118
to
+125
| addr = mmap(NULL, gpu->info.dax_size, PROT_READ | PROT_WRITE, | ||
| MAP_SHARED, fd, (off_t)gpu->info.dax_phys_addr); | ||
| close(fd); | ||
|
|
||
| if (addr == MAP_FAILED) { | ||
| perror("daxfs-gpu-map: mmap /dev/mem"); | ||
| return -1; | ||
| } |
Comment on lines
+164
to
+176
| fprintf(stderr, "daxfs-gpu-map: dmabuf_mmap: fd=%d size=%llu\n", | ||
| dmabuf_fd, (unsigned long long)gpu->info.dax_size); | ||
|
|
||
| addr = mmap(NULL, gpu->info.dax_size, PROT_READ | PROT_WRITE, | ||
| MAP_SHARED, dmabuf_fd, 0); | ||
| if (addr == MAP_FAILED) { | ||
| perror("daxfs-gpu-map: mmap dma-buf fd"); | ||
| close(dmabuf_fd); | ||
| return -1; | ||
| } | ||
|
|
||
| fprintf(stderr, "daxfs-gpu-map: dmabuf mmap'd at %p\n", addr); | ||
|
|
Comment on lines
+17
to
+26
| #include <cuda.h> | ||
| #include <stdint.h> | ||
| #include "daxfs_format.h" | ||
|
|
||
| struct daxfs_gpu_map { | ||
| /* GPU mapping */ | ||
| CUdeviceptr base; /* GPU pointer to DAX base */ | ||
| CUexternalMemory ext_mem; /* Handle (dma-buf path) */ | ||
| size_t size; /* Total mapped size */ | ||
|
|
Comment on lines
+21
to
+35
| #ifdef __CUDACC__ | ||
|
|
||
| /* Re-export the state/tag helpers so GPU code matches kernel conventions. | ||
| * Guarded so this header can coexist with daxfs_format.h. */ | ||
| #ifndef PCACHE_STATE_FREE | ||
| #define PCACHE_STATE_FREE 0 | ||
| #define PCACHE_STATE_PENDING 1 | ||
| #define PCACHE_STATE_VALID 2 | ||
| #endif | ||
|
|
||
| #ifndef PCACHE_STATE | ||
| #define PCACHE_STATE(v) ((v) & 3ULL) | ||
| #define PCACHE_TAG(v) ((v) >> 2) | ||
| #define PCACHE_MAKE(state, tag) (((unsigned long long)(tag) << 2) | (state)) | ||
| #endif |
Comment on lines
+238
to
+248
|
|
||
| for (int i = 0; i < iters; i++) { | ||
| /* Explicit GPU load from host (PCIe read TLPs) */ | ||
| for (unsigned int b = 0; b < xfer_size; b += sizeof(unsigned long long)) { | ||
| unsigned long long val = | ||
| *(volatile const unsigned long long *)(src + b); | ||
| /* Store to device mem so compiler doesn't elide the load */ | ||
| if (dev_dst) | ||
| *(unsigned long long *)(dev_dst + b) = val; | ||
| } | ||
| } |
Comment on lines
+1
to
+15
| /* Local wrapper to fix glibc 2.41+ / CUDA rsqrt noexcept conflict */ | ||
| #ifndef _DAXFS_MATH_FUNCTIONS_COMPAT_H | ||
| #define _DAXFS_MATH_FUNCTIONS_COMPAT_H | ||
|
|
||
| /* Include the real CUDA header first */ | ||
| #include_next <crt/math_functions.h> | ||
|
|
||
| /* Now suppress the glibc redeclarations by pre-declaring them | ||
| in a compatible way before glibc's math.h gets pulled in. */ | ||
| #ifdef __cplusplus | ||
| extern "C" { | ||
| #endif | ||
|
|
||
| /* Override glibc's rsqrt/rsqrtf declarations to avoid noexcept mismatch. | ||
| We define them as weak aliases so they don't conflict. */ |
Comment on lines
+20
to
+25
| CUDA_CFLAGS = -I../include -I$(CUDA_PATH)/include | ||
| CUDA_LDFLAGS = -L$(CUDA_PATH)/lib64 -lcuda | ||
|
|
||
| GPU_TARGETS = daxfs-gpu-map.o | ||
|
|
||
| gpu: $(GPU_TARGETS) |
Comment on lines
+121
to
+146
| sizes = [64, 256, 4096, 65536, 1 << 20] | ||
| labels = ['64B', '256B', '4KB', '64KB', '1MB'] | ||
|
|
||
| for exp, label, color, marker in [ | ||
| ('gpu_p2p_read', 'GPU Read from CXL', '#2196F3', 'o'), | ||
| ('gpu_p2p_write', 'GPU Write to CXL', '#FF5722', 's'), | ||
| ]: | ||
| sub = df[df['experiment'] == exp] | ||
| if len(sub) == 0: continue | ||
| # thru_mops is actually GB/s * 1000 in this context | ||
| means, stds_v = [], [] | ||
| for sz in sizes: | ||
| s = sub[sub['parameter'] == sz] | ||
| if len(s) == 0: continue | ||
| # Compute GB/s from latency and size | ||
| lat = s['latency_ns'].mean() | ||
| bw = sz / lat # bytes/ns = GB/s | ||
| means.append(bw) | ||
| stds_v.append(0) | ||
| if means: | ||
| ax.plot(range(len(means)), means, marker=marker, color=color, | ||
| linewidth=2, markersize=7, label=label) | ||
|
|
||
| ax.set_xticks(range(len(labels))) | ||
| ax.set_xticklabels(labels) | ||
| ax.set_xlabel('Transfer Size') |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Add this suggestion to a batch that can be applied as a single commit. This suggestion is invalid because no changes were made to the code. Suggestions cannot be applied while the pull request is closed. Suggestions cannot be applied while viewing a subset of changes. Only one suggestion per line can be applied in a batch. Add this suggestion to a batch that can be applied as a single commit. Applying suggestions on deleted lines is not supported. You must change the existing code in this line in order to create a valid suggestion. Outdated suggestions cannot be applied. This suggestion has been applied or marked resolved. Suggestions cannot be applied from pending reviews. Suggestions cannot be applied on multi-line comments. Suggestions cannot be applied while the pull request is queued to merge. Suggestion cannot be applied right now. Please check back later.
No description provided.