diff --git a/CMakeLists.txt b/CMakeLists.txt index 2586106b69..054cf32921 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -384,6 +384,9 @@ install (EXPORT OIIO_EXPORTED_TARGETS if (PROJECT_IS_TOP_LEVEL AND BUILD_TESTING AND ${PROJ_NAME}_BUILD_TESTS) + if (NOT BUILD_OIIOUTIL_ONLY) + add_subdirectory (testsuite/texture-device) + endif () oiio_setup_test_data() oiio_add_all_tests() endif () diff --git a/src/cmake/testing.cmake b/src/cmake/testing.cmake index 93aa7345e6..4ef7e2f33e 100644 --- a/src/cmake/testing.cmake +++ b/src/cmake/testing.cmake @@ -170,6 +170,7 @@ macro (oiio_add_all_tests) cmake-consumer cryptomatte docs-examples-cpp + texture-device iinfo igrep nonwhole-tiles oiiotool diff --git a/testsuite/texture-device/CMakeLists.txt b/testsuite/texture-device/CMakeLists.txt new file mode 100644 index 0000000000..dc33933f01 --- /dev/null +++ b/testsuite/texture-device/CMakeLists.txt @@ -0,0 +1,78 @@ +# Copyright Contributors to the OpenImageIO project. +# SPDX-License-Identifier: Apache-2.0 +# https://github.com/AcademySoftwareFoundation/OpenImageIO + +cmake_minimum_required (VERSION 3.15) + +if (CMAKE_CURRENT_SOURCE_DIR STREQUAL CMAKE_SOURCE_DIR) + message (FATAL_ERROR + "texture-device must be configured from OpenImageIO top-level CMake") +endif () + +if (NOT CMAKE_BUILD_TYPE) + set (CMAKE_BUILD_TYPE "Release") +endif () + +set (CMAKE_CXX_STANDARD 17 CACHE STRING "C++ standard to prefer (17, 20, etc.)") +set (CMAKE_CXX_STANDARD_REQUIRED ON) +set (CMAKE_CXX_EXTENSIONS OFF) + +if (TARGET OpenImageIO AND NOT TARGET OpenImageIO::OpenImageIO) + add_library (OpenImageIO::OpenImageIO ALIAS OpenImageIO) +endif () + +if (NOT TARGET OpenImageIO::OpenImageIO) + message (FATAL_ERROR + "Missing target OpenImageIO::OpenImageIO; configure from OIIO root") +endif () +if (NOT TARGET Imath::Imath) + message (FATAL_ERROR + "Missing target Imath::Imath; configure from OIIO root") +endif () + +# Special for OIIO testsuite when running in sanitize mode +if (DEFINED ENV{SANITIZE}) + add_compile_options (-fsanitize=$ENV{SANITIZE}) + add_link_options (-fsanitize=$ENV{SANITIZE}) +endif() + +add_executable (texture-device + arena.cpp + blend.cpp + host.cpp + texture_loader.cpp + device_tests.cpp +) +target_link_libraries (texture-device + PRIVATE OpenImageIO::OpenImageIO Imath::Imath) + +if (MSVC) + target_compile_options (texture-device PRIVATE /utf-8) +endif () + +# Compile+link gate for code that is intended to remain device-safe and +# header-only. This target must build without linking against OpenImageIO +# libraries, so unresolved symbols are detected at link time. +add_executable (texture-device-devicecheck + arena.cpp + blend.cpp + device_tests.cpp + devicecheck_main.cpp +) +if (MSVC) + target_compile_options (texture-device-devicecheck PRIVATE /utf-8) + # devicecheck intentionally does not link OIIO libs; avoid dllimport on + # inline OIIO types (like ustringhash) to prevent unresolved imports. + target_compile_definitions (texture-device-devicecheck PRIVATE OIIO_STATIC_DEFINE) +endif () +target_include_directories (texture-device-devicecheck + PRIVATE + $ +) + +if (TARGET Imath::Imath) + target_include_directories (texture-device-devicecheck + PRIVATE + $ + ) +endif () diff --git a/testsuite/texture-device/README.md b/testsuite/texture-device/README.md new file mode 100644 index 0000000000..270ff80a4d --- /dev/null +++ b/testsuite/texture-device/README.md @@ -0,0 +1,410 @@ +# A GPU texture system for a mock-up device + +This directory contains a prototype texture system for code that must be safe in a +GPU-like kernel environment. The "device" in this test is a mock implementation, +but the code is structured as if host and device memory were separate, with +explicit copies between them. The demo operation is a simple blend + blur of two +input images: + +| Input A | Input B | Operation on device | Result | +|---------|---------|---------------------|--------| +| ![grid input](img/grid.jpg) | ![checker input](img/checker.jpg) | blur + blend | ![output result](img/result.jpg) | + +The goal is to validate a manager/managed architecture for texture lookup, +missing-resource requests, and retry-based execution. The host launches a kernel, +the kernel records missing textures or tiles, and the host resolves those requests +and relaunches until all required data is resident. + +The test is intentionally scalar and pragmatic. It prioritizes architecture, +correctness, and integration points over peak performance. + +# The Arena and tagged_ptr framework + +`Host` and `MockDevice` are the concrete arenas; both implement `alloc/free`, +`copy_to/copy_from`, and `copy_in`. `Host` tracks allocations and validates frees; +`MockDevice` does the same and also implements `run()` for kernel dispatch. + +`tagged_ptr` carries a lightweight context tag with every pointer. Dereference +checks the tag against the active execution context and aborts on mismatch — +catching accidental cross-boundary access without runtime overhead. + +## Optional CUDA arena sketch + +`arena.h`/`arena.cpp` include a guidance-only `CudaArena`. + +- Active only when CUDA headers are present (`TEXTURE_DEVICE_HAS_CUDA_RUNTIME`). +- Live CUDA calls require explicit opt-in (`TEXTURE_DEVICE_ENABLE_CUDA_SKETCH_IMPL=1`). +- `Atomic` wraps CUDA atomics under `__CUDA_ARCH__`, and `std::atomic` on host. + +The runtime still uses `MockDevice`; `CudaArena` is a future integration sketch. + +# The main loop + +The launch model is fail-and-retry: + +```cpp +// copy_to/copy_from take tagged_ptr<[const] void> to enforce context safety; +// the snippet uses &op as shorthand for tagged_ptr(&op, "Host"). +for (int pass = 0; pass < max_passes; ++pass) { + textures.begin_launch(); + device.copy_to(device_op, &op, sizeof(op)); + device.run(width, height, &blend_kernel, device_op); + device.copy_from(&op, device_op, sizeof(op)); + + textures.sync_from_managed(); + + if (textures.needs_retry()) { + textures.sync_to_managed(); + continue; // queue overflow path: recollect requests + } + + if (!textures.failures()) + break; // converged: all resources resident + + textures.process_requests([&](const Request& req, + DTextureSystem* ts) { + return loader.process_request(req, ts); + }); + textures.sync_to_managed(); +} +``` + +## Pass sequence at a glance + +```mermaid +sequenceDiagram + participant H as Host + participant D as MockDevice + participant K as blend_kernel + participant T as DTextureSystem + participant L as TextureLoader + + H->>D: copy_to(op) + H->>D: run(width, height, blend_kernel) + D->>K: execute pixels + K->>T: lookup(name_a/name_b) + T-->>K: sample or record request + H->>T: sync_from_managed() + alt queue overflow + H->>T: needs_retry() grows + clears queue + H->>T: sync_to_managed() + H->>D: relaunch pass + else normal request handling + H->>T: process_requests(...) + H->>L: process_request(req) + L-->>T: set_texture_ready / set_tile_payload + H->>T: sync_to_managed() + end +``` + +## Our example blend kernel + +```cpp +void blend_kernel(int x, int y, tagged_ptr data) +{ + tagged_ptr op(data); + const float resx = static_cast(op->width); + const float resy = static_cast(op->height); + const float invx = 1.0f / resx; + const float invy = 1.0f / resy; + const float u = (static_cast(x) + 0.5f) * invx; + const float v = (static_cast(y) + 0.5f) * invy; + + Vec2 duA { invx, 0.0f }; + Vec2 dvA { 0.0f, invy }; + Vec2 duB = 4.0f * duA; + Vec2 dvB = 4.0f * dvA; + + RGBA A = op->texture_system.lookup(op->name_a, u, v, duA, dvA); + RGBA B = op->texture_system.lookup(op->name_b, u, v, duB, dvB); + + if (op->texture_system.failures()) + return; + + op->output_buffer[y * op->width + x] = 0.5f * A + 0.5f * B; +} +``` + +# The manager/managed data container pattern + +The core type is DTextureSystem. + +- Managed form (device view): DTextureSystem +- Manager form (host orchestration): DTextureSystem + +Below is a sketch of how the managed mirror is constructed, synced, and transferred to the device: +```cpp + Host host; + MockDevice device; + tagged_ptr dptr = + device.alloc(sizeof(DTextureSystem), ""); + DTextureSystem texsys_gpu(device); + DTextureSystem texsys_host(host, device, texsys_gpu); + ... + texsys_host.sync_to_managed(); + device.copy_to(dptr, tagged_ptr(&texsys_host, "Host"), + sizeof(DTextureSystem)); +``` + +Both forms are instantiated from the same template, but they expose different +APIs and fields through type-driven conditional declarations. + +- Method availability is gated with `std::enable_if` helpers (the `OPT_*` + macros), so host-only orchestration methods exist only when + `IsManager == true`. +- Optional fields are expressed with `std::conditional_t`, allowing one + definition to carry manager-only state (for example, a reference to the + managed instance) without duplicating the whole class. + +This gives one shared data layout model and one lookup code path while still +enforcing that device-side instances do not expose host-only operations. + +### Composition + +`DTextureSystem` is mostly composition of smaller manager/managed containers: + +- `RequestQueue` (`ClosedHashMap`) +- `TextureMap` (`ClosedHashMap`) +- `TileIndexMap` (`ClosedHashMap`) +- `TilePool` (`Stream`) +- POD arrays/counters (`m_textures`, counts, failure flags) + +`sync_to_managed()` calls each sub-component's own sync, then copies +`DTextureSystem`'s plain fields (flags, records, counters). `sync_from_managed()` +pulls request and failure state back in the reverse order. + +### Manager and managed ownership + +```mermaid +flowchart LR + subgraph HostSide[Manager side on Host] + HM[DTextureSystem] + HMap[TextureMap] + HTile[TileIndexMap] + HPool[TilePool Stream] + HReq[RequestQueue] + HM --> HMap + HM --> HTile + HM --> HPool + HM --> HReq + end + + subgraph DeviceSide[Managed side on MockDevice] + DM[DTextureSystem] + DMap[TextureMap] + DTile[TileIndexMap] + DPool[TilePool Stream] + DReq[RequestQueue] + DM --> DMap + DM --> DTile + DM --> DPool + DM --> DReq + end + + HM -- sync_to_managed --> DM + DM -- sync_from_managed --> HM +``` + +## ClosedHashMap + +ClosedHashMap is a pointerless open-addressed hash map with linear probing. + +Design intent: + +- Device-readable and device-writable in place. +- Manager can grow and rehash as needed. +- Atomic slot state transitions support concurrent insertion. + +## Stream + +Stream is a paged container used as a sidecar pool for large payloads (tile pixel +data). Instead of storing large values directly in a hash map, the map stores an +index and Stream stores the actual records. + +Key properties in this prototype: + +- Manager appends records with push_back. +- Managed side reads by index. +- Data is synchronized by sync_to_managed() from manager. +- Each page holds as many elements as needed for a good page size (64 Mb). + +### Stream page ownership model + +The manager keeps a writable staging page for `push_back()` (effectively the +current tail page). The managed side holds the full page table and reads all +resident pages as immutable data during lookup. + +```mermaid +flowchart LR + direction LR + subgraph Manager[Manager Stream on Host] + direction LR + MLast[page N] + MPush[push_back] + MPush --> MLast + end + + subgraph Managed[Managed Stream on Device read only] + direction LR + DN[page N] + D1[page 1] + D0[page 0] + end + DRead[operator [i]] + DRead --> D0 + DRead --> D1 + DRead --> DN + + MLast -- sync tail page --> DN +``` + +Current limitation: tile payloads are fixed to 64×64 records. + +This separation keeps indexing structures compact while allowing larger payload +storage to grow independently. + +## DTextureSystem + +DTextureSystem composes the containers above into a minimal texture runtime and +provides the device-side `lookup()` and the manager-side orchestration API. + +### Lookup pipeline (code-level) + +At a high level, `lookup(name, u, v, du, dv, rnd)` runs this sequence +(`rnd` defaults to `-1` for deterministic behavior): + +1. Resolve `TextureRecord` from `TextureMap`. +2. Build `MipAnisoFilter(texture, du, dv)`. +3. For each selected mip level: + - Generate weighted taps (`generate_samples`) for that mip. + - Resolve taps to resident tiles via `load_tiles(...)`. + - If taps are resident, accumulate weighted texels from `TilePool`. +4. If any mip had missing taps, return debug magenta for that pass. +5. Otherwise return the accumulated filtered color. + +The important split is: +- `generate_samples(...)` computes *what* to sample. +- `load_tiles(...)` resolves *where* those samples live (or enqueues requests). +- The accumulation loop computes the final color. + +### Anisotropic filtering in this prototype + +The filtering path is scalar and follows the OIIO-style helper flow, but with a +pragmatic bilinear-tap implementation: + +1. `ellipse_axes(texture, du, dv)` + - Converts derivatives to texture-space footprint radii. + - Chooses major/minor axis and normalized axis direction. + +2. `compute_miplevels(texture, axes)` + - Uses the minor footprint radius to compute LOD. + - Selects `mip0`, `mip1`, and blend factor (`mip_blend`) for trilinear + blending. + +3. `compute_ellipse_sampling(texture, axes, mip)` + - Computes anisotropy aspect ratio and clamps sample count to + `kMaxSamples`. + - Places samples along the major axis span in UV space. + +4. `generate_samples(...)` + - For each anisotropic position, computes a Gaussian-like weight. + - Expands each position into 4 bilinear taps (`x0/x1`, `y0/y1`). + - Wraps coordinates, derives tile coords + local pixel coords, emits taps. + +5. Accumulation + - For each mip, accumulate `mip_weight * sample_weight * texel`. + - Blend across mips via the trilinear `mip_blend` weight. + +Experimental stochastic path (when `rnd >= 0`): + +- Stochastically selects one mip level instead of blending two. +- Stochastically selects one tap from the generated sample set. +- The default runtime path remains deterministic; the blend kernel + calls `lookup(...)` without supplying `rnd`. + +So the current implementation is "anisotropic sample placement + bilinear texel +fetches + trilinear mip blend". It is intentionally simpler than full EWA area +integration, but it preserves the key footprint-driven behavior for this test. + +## Request lifecycle (one miss to one retry) + +This is the most important runtime sequence to understand: + +1. Kernel calls `lookup(name, u, v, du, dv)`. +2. If the texture record is unknown, `MissingTexture` is inserted in the request + queue and failure is marked. +3. If the texture exists but one or more tiles are missing, each missing tile is + inserted as `MissingTile` and failure is marked. +4. Kernel keeps running other pixels so the pass collects as many unique requests + as possible. +5. Host calls `sync_from_managed()` and checks retry conditions. +6. If queue overflow happened, `needs_retry()` grows the queue, clears it, and the + host relaunches immediately to recollect requests at larger capacity. +7. Otherwise host walks deduplicated requests via `process_requests(...)`. +8. `TextureLoader` resolves texture metadata and loads tile payloads into + `TextureMap`, `TileIndexMap`, and `TilePool`. +9. Host calls `sync_to_managed()` and launches the next pass. +10. When no failures remain, output is written. + +### Retry state machine + +```mermaid +stateDiagram-v2 + [*] --> Launch + Launch --> CollectRequests: kernel lookup misses + Launch --> Converged: no failures + CollectRequests --> GrowAndRelaunch: queue overflow + CollectRequests --> ProcessOnHost: queue has requests + GrowAndRelaunch --> Launch + ProcessOnHost --> SyncToManaged + SyncToManaged --> Launch + Converged --> WriteOutput + WriteOutput --> [*] +``` + +# Build and run + +The local test runner is `run.py`, which configures and builds this folder with +CMake and then executes `texture-device`. + +Useful commands from repository root: + +1. `ctest --test-dir build -R texture-device --output-on-failure` +2. `cd testsuite/texture-device/build && cmake --build . --target texture-device texture-device-devicecheck` +3. `cd testsuite/texture-device/build && ./texture-device-devicecheck && ./texture-device` + +Expected outputs include: + +- out.exr +- out.txt + +There is also a second executable, texture-device-devicecheck, used as a compile/link +gate for code intended to remain device-safe and header-centric. + +# Current scope and limitations + +- Prototype-only implementation intended for design validation. +- Scalar filtering path; not a full production texture cache. +- Retry loop is explicit and host-driven. +- Prioritizes clarity and portability over optimization. + +# Code map + +If you are reading the code for the first time, this order is a good path: + +1. host.cpp: sets up the manager side, launches the kernel, drives retry, writes + output. +2. blend.h and blend.cpp: kernel entry point and blend operation structure. +3. texture_device_decl.h and texture_device_impl.h: core DTextureSystem API and + lookup implementation. +4. texture_loader.h and texture_loader.cpp: host-side request fulfillment. +5. closed_hashmap.h and stream.h: core containers used by DTextureSystem. +6. filtering_decl.h and filtering_impl.h: EWA footprint computation and sample + generation pipeline. +7. vector_lite.h: fixed-capacity dynamic-size container used by the filtering + path. +8. arena.h and tagged_ptr.h: memory and pointer-context safety infrastructure. +9. device_tests.cpp: focused self-tests for map, request, and filtering behavior. + +This progression follows the runtime flow from orchestration to kernel lookup and +then to low-level data structures. \ No newline at end of file diff --git a/testsuite/texture-device/SPEC.md b/testsuite/texture-device/SPEC.md new file mode 100644 index 0000000000..ff747d8232 --- /dev/null +++ b/testsuite/texture-device/SPEC.md @@ -0,0 +1,334 @@ +# GPU Texture System + +Create a new test to prototype a new C++ texture system that can work on a GPU. +We will call this GPU "the device", and the only requirement from it is that it can +run a kernel written in C++. The test will create an executable following the +pattern of the few existing tests that already do that. + +The purpose of the test is to simulate execution of a texture system on the device. +For this we will write a class that models the device interface as seen from the +CPU. For now it will just be a fake GPU. Here's an example: +```cpp +class MockDevice { + using Kernel = void (*)(int x, int y, tagged_ptr data); + + tagged_ptr alloc(size_t bytes, const char* purpose); + void free(tagged_ptr p); + void copy_to(tagged_ptr device, tagged_ptr host, + size_t bytes); + void copy_from(tagged_ptr host, tagged_ptr device, + size_t bytes); + void run(int width, int height, Kernel kernel, tagged_ptr data); +}; +``` + +The MockDevice class is minimal. We are going to assume that what you allocate in the +device cannot be accessed by the host and vice versa. But truly it is just a mock-up, +`copy_to` and `copy_from` will just be `memcpy`. But we write the rest of the code as +if these pointers were real host, GPU. + + +The test is split between host orchestration (`host.cpp`) and the kernel source +(`blend.cpp`). Both are compiled together and linked to OIIO for the main test. +There is also a sanity-check binary where the kernel/device-side code must +compile itself alone to an executable, even if it is useless. This separate binary: + * Can include any OIIO header. BUT ... + * It must **not** link against OIIO. + +And that is our way of ensuring the kernel with the device texture system can +actually run on a GPU. Do this as a separate build target in the `CMakeLists.txt` so +cmake itself enforces it. + + +## Test Exercise + +The idea of this test is to do a simple operation. A kernel will access two on-disk +textures via file name plus u, v coordinates (with derivatives). It will do a blend +of the two where one uses pixel size derivatives and the other uses something 4 +times bigger so we see a blur. Then the output image will be written to disk by the +host side using OIIO. + +## Purpose of the test + +Gather all the functionality we need from OIIO to prototype the texture system so +we can then publish it in OIIO headers one thing at a time. In the first stage, if +something is not yet exposed to be inlined, we will copy&paste it into the test +code. And when we are happy with the result we can start moving code to the official +headers inside OIIO. + +## General Guidelines +### OIIO Style + +The common OpenImageIO coding style should be followed and a few existing headers +and sources should be explored. We are going to be borrowing code for texture +filtering and mip mapping, so this should be located first. Implement the full +EWA/anisotropic footprint pipeline (ellipse_axes, anisotropic_aspect, +compute_miplevels, compute_ellipse_sampling) as in `texturesys.cpp`. + +### Start Simple but with Full Filtering + +This prototype is a proof of concept and we must use simple approaches except +for mip mapping. The footprint computation (ellipse_axes, anisotropic_aspect, +compute_miplevels, compute_ellipse_sampling) should be close to the original +but keep the inner sampling to bilinear-only (no bicubic, no stochastic). + +This is the hardest part of the effort. The logic needs to be decoupled from `ImageCache` +and extracted without SIMD to headers in the test for inlining. You may need +several logical headers to organize the code. + +### Prepare for Inlining + +All tooling code that has to run on the device must be inlined. For later +incorporation in OpenImageIO we will keep it in headers inside the test. If the +implementation is more than a few lines we will split headers in blah_decl.h and +blah_impl.h. + +### SIMD vs scalar on device + +OIIO's existing texture filtering uses `simd::vfloat4` internally (with scalar fallbacks +when `OIIO_SIMD` is 0). Use only scalar logic and ignore SIMD. + + +# Design + +The device texture system uses a launch, fail and retry cycle. This means we try +to run the kernel and if something is missing execution is aborted so the host can +load what the kernel needs. The following data structures are needed: + * A device-side map from ustringhash to texture descriptor. This descriptor holds + data and display window, as well as mip level ranges. It will also keep a + pointer to the following ... + * Another per-texture map from tile coordinates (mip level, x and y) to the + pixels of the tile, that is float RGBA colors. + * A request queue. Requests can be missing textures (file name ustringhash) or + missing tiles (ustringhash plus tile coordinates). + +All these maps are pointerless hash tables that work on the device. Defined as a +template on the Key and Value, they are always created, filled and resized on the +host. Then copied in just one block to the device. The device kernel can query +them but not modify or grow. + +We will use a closed hash (open addressing) with linear probing. OIIO has `hash.h` +with `farmhash` but no pointerless hash table template. Write the closed-hash table +from scratch but using similar patterns. If possible reuse the same hash functions. + +In the current prototype, the request queue is implemented as a closed hash map +(`Request -> bool`) with dedup semantics. This keeps request insertion idempotent +across many pixels and avoids repeated host work. The queue can overflow when full; +on host side this triggers a grow+retry path (`needs_retry()`), which clears the +queue and re-runs the launch to recollect missing requests at the new capacity. + +Note all pixels in the kernel run are executed even if one fails. We want to collect +as many requests as possible. Concurrent access to the queue will be based on atomic +integers. It is ok to use the CPU ones for now, but we can use a +`DTextureSystem::atomic_inc()` method to abstract it. + + +## The tile hash map + +Tiles are big, so it wouldn't be efficient to store them in a closed hash table. +Instead we will store them in a sidecar vector ordered as they come. Then the hash +map will just store integer offsets into this table. + +## Example execution + +This is the workflow that the test will follow, starting with completely empty +data structures: + 1. Launches the kernel + 2. Kernel tries to read a texture by ustringhash, u, v, etc... + 3. Texture is missing. File a request, give up on this pixel. + 4. Host finds the kernel failed, handle requests and update tables on device. + 5. Repeat + +The host fetches the request queue after a kernel run (step 4). A failure exists if +it is not empty. + +On the second run textures may be found but tiles will be missing so more requests +will be filed for the host to satisfy until the kernel runs to completion. When +the kernel looks up a known texture, it computes the footprint of the filter, and from +that it decides which tiles it needs from what mip levels. Whatever is missing it's +requested and otherwise the lookup is computed and returned. + +## Main APIs + +The texture system lives both on the host and on device. It is a templated C++ class +with a managed (device) specialization and manager (host) specialization. It keeps: + * The hash map of texture descriptors + * A hash map from tile coordinates to tile-pool indices + * A paged tile pool (sidecar stream) storing tile pixel payloads + * The request queue + +All of these have identical copies on the device side. It is structured like this: +```cpp +template +class DTextureSystem { + // Device-side lookup path + RGBA lookup(ustringhash, float u, float v, Vec2 du, Vec2 dv); + + // Host-side orchestration path (when IsManager=true) + bool needs_retry(); + template bool process_requests(Func&&); + void sync_to_managed(); + void sync_from_managed(); + + // Resident state + ClosedHashMap textures; + ClosedHashMap tile_index; + Stream tile_pool; + ClosedHashMap requests; +}; + +class TextureLoader { + bool process_request(const Request&, DTextureSystem*); +}; + +template +struct vector_lite : public std::array { + // Fixed-capacity storage with dynamic active size. + unsigned size() const; + void push_back(const T&); +}; + +struct RequestHash { + size_t operator()(const Request&) const; + static uint64_t hash_mix_u64(uint64_t h, uint64_t v); +}; + +struct MipSelection { + unsigned mip_levels[2]; + float mip_blend; +}; +``` + +The key idea is that `DTextureSystem` is device data and code with host-manager +specialization support. The implementation is split in declaration/implementation +headers for inlining (`texture_device_decl.h` + `texture_device_impl.h`), and host +texture resolution/payload loading is delegated to `TextureLoader`. + +Filtering sample generation uses a return-by-value fixed-capacity container +(`vector_lite`) so we can keep stack-local ownership while +still tracking runtime sample count. Tile-loading and accumulation loops consume +the dynamic active size instead of always iterating the full capacity. + +OIIO's tile size defaults to 64. Use 64x64 as the hardcoded tile size and assume +RGBA shading values (accept RGB or RGBA input textures; use alpha=1 for RGB). The buffers for the tiles are grown and copied again +when new data is added. This is inefficient but good enough for a first draft. +Also we are using `RGBA` and `Vec2`. These should be: + - `RGBA` → `Imath::C4f` (`.r`, `.g`, `.b`, `.a`) + - `Vec2` → `Imath::V2f` (already available via OIIO's Imath dependency). + + +## Example Kernel + +The device runs the kernel on a grid of pixels with a tagged data pointer. +```cpp +void blend_kernel(int x, int y, tagged_ptr data){ + tagged_ptr op(data); + float resx = float(op->width), resy = float(op->height); + float u = x / resx, v = y / resy; + Vec2 duA = { 1 / resx, 0 }, dvA = { 0, 1 / resy }; + Vec2 duB = 4 * duA, dvB = 4 * dvA; // Make it blurry + RGBA A = op->texture_system.lookup(op->name_a, u, v, duA, dvA); + RGBA B = op->texture_system.lookup(op->name_b, u, v, duB, dvB); + // We let both lookups run to file as many requests as possible, but now we can + // early exit. + if (op->texture_system.failures()) + return; + op->output_buffer[y * op->width + x] = 0.5 * A + 0.5 * B; +} +``` + +The 'MockDevice::run()' method just runs this kernel for all pixels using an OIIO +parallel loop to check that we don't run into race conditions. This method +implementation can live in host.cpp for the mock-up, only the kernel lives in +blend.cpp. + +# Steps + +This is a suggested rough plan + 1. Create basic test files, cmake and cpp. + 2. Write an implementation of a pointerless hash table. + 3. Write unit tests within the existing test that writes, uploads and queries + the hash. Make sure these unit tests pass. + 4. Write and test the launch, fail, repeat cycle. + 5. Design the request queue implementation and also unit test it. + 6. Gather the filtering functionality that we need for the kernel. + 7. Finally write the blending test. + +`texture-device` should be added to the `oiio_add_tests()` list in +`src/cmake/testing.cmake` so it's discovered by CTest. This is part of step 1 +(creating basic test files). The mentioned unit tests should be assert-based self-tests +within the main test executable that run before the blend kernel, exiting non-zero +on failure. + +For the blend test use the existing `testsuite/common/textures/grid.tx` and +`checker.tx` (pre-MIP-mapped, already in repo). And we read everything using OIIO +`ImageInput`. Convert to 3 channel if needed and do not clean up any generated files +(output images, build directories) automatically, the expectation is that `run.py` +leaves artifacts behind for debugging. Also use exr/tif output and image comparison +via `diff_command` in `run.py` for the blend test validation. + +# Codebase Analysis + +## Existing test patterns + +Only 2 test suites produce their own executables: +- `testsuite/cmake-consumer/` -- standalone `CMakeLists.txt`, builds `consumer` target + linked to OIIO, run via `run.py` +- `testsuite/docs-examples-cpp/` -- standalone `CMakeLists.txt`, builds 8 executables + (`docs-examples-texturesys`, etc.), all linked to both OIIO and Imath + +All other 142+ tests are `runtest.py`-based, using pre-built OIIO tools (`testtex`, +`oiiotool`, `maketx`). + +The `cmake-consumer` pattern is closest to what we need: a `CMakeLists.txt` that builds +executables and a `run.py` that invokes cmake + runs them. Tests are registered in +`src/cmake/testing.cmake` via `oiio_add_tests()`. + +No existing test uses a `host.cpp` / `blend.cpp` split or has one target linked to OIIO +and another not. + +## Key OIIO headers and types for device-side code + +**GPU-safe headers already in OIIO:** +- `OpenImageIO/ustring.h` -- `ustringhash` is header-only, stores a single `uint64_t`, + fully `constexpr`/device-safe +- `OpenImageIO/hash.h` -- `farmhash::inlined::Hash64` is `OIIO_HOSTDEVICE constexpr` +- `OpenImageIO/fmath.h` -- `fast_sincos`, `fast_atan2`, `safe_sqrt`, `madd`, `clamp`, + `fast_exp`, etc. all `OIIO_HOSTDEVICE` +- `OpenImageIO/platform.h` -- `OIIO_HOSTDEVICE`, `OIIO_DEVICE_CONSTEXPR` macros +- `OpenImageIO/simd.h` -- `vfloat4`, `vint4`, etc. have scalar fallbacks when `OIIO_SIMD` is 0 +- `OpenImageIO/texture.h` -- `TextureOpt`, `MipMode`, `InterpMode`, `Wrap` enums (no GPU + annotations, but pure data) + +**NOT GPU-safe:** +- `ustring` (not `ustringhash`) -- uses global hash table with mutex/malloc +- SIMD types with `OIIO_SIMD > 0` -- use SSE/AVX intrinsics + +**Types:** +- Vec2/Vec3: OIIO wraps Imath (`Imath::V2f`, `Imath::V3f`) via `OpenImageIO/Imath.h` +- RGBA color: `Imath::C4f` (`.r`, `.g`, `.b`, `.a`) or raw `float[4]` +- OIIO internally uses `simd::vfloat3` (padded to 4) for color accumulation + +## Texture filtering code location + +All in `src/libtexture/texturesys.cpp`: +- `ellipse_axes()` -- EWA footprint computation (pure math, ~40 lines) +- `anisotropic_aspect()` -- anisotropy clamping (in `texture_pvt.h`, ~35 lines) +- `adjust_blur()` -- sblur/tblur application +- `compute_miplevels()` -- mip level selection with blend weights (~80 lines) +- `compute_ellipse_sampling()` -- sample placement along major axis (~55 lines) +- `st_to_texel()` -- coordinate to texel mapping (in `texture_pvt.h`, ~15 lines) +- `sample_bilinear()` -- bilinear interpolation within a mip level (depends heavily on + tile cache, ~200+ lines) +- `texture_lookup()` -- full EWA anisotropic lookup, orchestrates all of the above + (~230 lines) + +The tile cache layer (`ImageCacheImpl`, `ImageCacheFile`, `ImageCacheTile`) is the biggest +piece that needs adaptation for GPU. + +## Source textures available in repo + +- `testsuite/common/textures/grid.tx` -- pre-MIP-mapped grid, used by 25+ existing tests +- `testsuite/common/textures/checker.tx` -- pre-MIP-mapped checkerboard, used by `texture-blurtube` +- `testsuite/common/grid.tif` -- 1000x1000 plain TIFF grid, can be `maketx`'d at runtime +- `testsuite/common/checker_with_alpha.exr` -- checkerboard with alpha diff --git a/testsuite/texture-device/arena.cpp b/testsuite/texture-device/arena.cpp new file mode 100644 index 0000000000..91392b7425 --- /dev/null +++ b/testsuite/texture-device/arena.cpp @@ -0,0 +1,313 @@ +// Copyright Contributors to the OpenImageIO project. +// SPDX-License-Identifier: Apache-2.0 +// https://github.com/AcademySoftwareFoundation/OpenImageIO + +#include "arena.h" + +#include +#include +#include + +namespace texture_device { + +uint64_t g_tagged_ptr_context = ptrtag("Host"); + +namespace { + + template + void report_leaks(const char* owner, const AllocationMap& allocated) + { + if (allocated.empty()) + return; + + std::fprintf(stderr, + "texture-device: %s leak check failed (%zu allocations)\n", + owner, allocated.size()); + for (const auto& it : allocated) { + const auto& rec = it.second; + std::fprintf(stderr, " leak ptr=%p bytes=%zu purpose=%s\n", + it.first, rec.bytes, + rec.purpose ? rec.purpose : "(null purpose)"); + } + std::abort(); + } + + template + void tracked_free(AllocationMap& allocated, tagged_ptr p, + const char* owner) + { + if (!p) + return; + + auto it = allocated.find(p.get()); + if (it == allocated.end()) { + std::fprintf( + stderr, + "texture-device: invalid free ptr=%p (not allocated by %s::alloc)\n", + p.get(), owner); + std::abort(); + } + allocated.erase(it); + std::free(p.get()); + } + +} // namespace + +Host::~Host() { report_leaks("Host", m_allocated); } + +tagged_ptr +Host::alloc(size_t bytes, const char* purpose) +{ + void* p = std::malloc(bytes); + if (!p) + return nullptr; + + m_allocated[p] = AllocationRecord { bytes, purpose }; + return { p, "Host" }; +} + +void +Host::free(tagged_ptr p) +{ + tracked_free(m_allocated, p, "Host"); +} + +void +Host::copy_to(tagged_ptr device, tagged_ptr host, + size_t bytes) +{ + if (!device || !host) + return; + + // copy_to expects a device destination and host source. + if (!device.is("MockDevice") || host.is("MockDevice")) + std::abort(); + + std::memcpy(device.get(), host.get(), bytes); +} + +void +Host::copy_from(tagged_ptr host, tagged_ptr device, + size_t bytes) +{ + if (!host || !device) + return; + + // copy_from expects a host destination and device source. + if (host.is("MockDevice") || !device.is("MockDevice")) + std::abort(); + + std::memcpy(host.get(), device.get(), bytes); +} + +void +Host::copy_in(tagged_ptr to, tagged_ptr from, size_t bytes) +{ + if (!to || !from) + return; + + // copy_in expects both pointers to belong to Host. + if (!to.is("Host") || !from.is("Host")) + std::abort(); + + std::memcpy(to.get(), from.get(), bytes); +} + +MockDevice::~MockDevice() { report_leaks("MockDevice", m_allocated); } + +tagged_ptr +MockDevice::alloc(size_t bytes, const char* purpose) +{ + void* p = std::malloc(bytes); + if (!p) + return nullptr; + + m_allocated[p] = AllocationRecord { bytes, purpose }; + return { p, "MockDevice" }; +} + +void +MockDevice::free(tagged_ptr p) +{ + tracked_free(m_allocated, p, "MockDevice"); +} + +void +MockDevice::copy_to(tagged_ptr device, tagged_ptr host, + size_t bytes) +{ + if (!device || !host) + return; + + // copy_to expects a device destination and host source. + if (!device.is("MockDevice") || host.is("MockDevice")) + std::abort(); + + std::memcpy(device.get(), host.get(), bytes); +} + +void +MockDevice::copy_from(tagged_ptr host, tagged_ptr device, + size_t bytes) +{ + if (!host || !device) + return; + + // copy_from expects a host destination and device source. + if (host.is("MockDevice") || !device.is("MockDevice")) + std::abort(); + + std::memcpy(host.get(), device.get(), bytes); +} + +void +MockDevice::copy_in(tagged_ptr to, tagged_ptr from, + size_t bytes) +{ + if (!to || !from) + return; + + // copy_in expects both pointers to belong to MockDevice. + if (!to.is("MockDevice") || !from.is("MockDevice")) + std::abort(); + + std::memcpy(to.get(), from.get(), bytes); +} + +void +MockDevice::run(int width, int height, Kernel kernel, tagged_ptr data) +{ + struct MockDeviceExecutionGuard { + MockDeviceExecutionGuard() + { + g_tagged_ptr_context = ptrtag("MockDevice"); + } + ~MockDeviceExecutionGuard() { g_tagged_ptr_context = ptrtag("Host"); } + } guard; + + for (int y = 0; y < height; ++y) { + for (int x = 0; x < width; ++x) + kernel(x, y, data); + } +} + +#if TEXTURE_DEVICE_HAS_CUDA_RUNTIME + +CudaArena::~CudaArena() +{ +# if TEXTURE_DEVICE_ENABLE_CUDA_SKETCH_IMPL + // FIXME: this frees tracked allocations but does not clear m_allocated, + // so report_leaks below will still abort if entries remain. + for (const auto& it : m_allocated) { + cudaFree(it.first); + } +# endif + report_leaks("CudaArena", m_allocated); +} + +tagged_ptr +CudaArena::alloc(size_t bytes, const char* purpose) +{ +# if TEXTURE_DEVICE_ENABLE_CUDA_SKETCH_IMPL + void* p = nullptr; + cudaError_t err = cudaMalloc(&p, bytes); + if (err != cudaSuccess) + return nullptr; + m_allocated[p] = AllocationRecord { bytes, purpose }; + return { p, "CudaDevice" }; +# else + (void)bytes; + (void)purpose; + return nullptr; +# endif +} + +void +CudaArena::free(tagged_ptr p) +{ +# if TEXTURE_DEVICE_ENABLE_CUDA_SKETCH_IMPL + if (!p) + return; + auto it = m_allocated.find(p.get()); + if (it == m_allocated.end()) { + std::fprintf( + stderr, + "texture-device: invalid free ptr=%p (not allocated by CudaArena::alloc)\n", + p.get()); + std::abort(); + } + // FIXME: check cudaFree return and surface failures. + cudaFree(p.get()); + m_allocated.erase(it); +# else + (void)p; +# endif +} + +void +CudaArena::copy_to(tagged_ptr device, tagged_ptr host, + size_t bytes) +{ +# if TEXTURE_DEVICE_ENABLE_CUDA_SKETCH_IMPL + if (!device || !host) + return; + // FIXME: check cudaMemcpyAsync return and decide synchronization policy. + cudaMemcpyAsync(device.get(), host.get(), bytes, cudaMemcpyHostToDevice, + m_stream); +# else + (void)device; + (void)host; + (void)bytes; +# endif +} + +void +CudaArena::copy_from(tagged_ptr host, tagged_ptr device, + size_t bytes) +{ +# if TEXTURE_DEVICE_ENABLE_CUDA_SKETCH_IMPL + if (!host || !device) + return; + // FIXME: check cudaMemcpyAsync return and decide synchronization policy. + cudaMemcpyAsync(host.get(), device.get(), bytes, cudaMemcpyDeviceToHost, + m_stream); +# else + (void)host; + (void)device; + (void)bytes; +# endif +} + +void +CudaArena::copy_in(tagged_ptr to, tagged_ptr from, + size_t bytes) +{ +# if TEXTURE_DEVICE_ENABLE_CUDA_SKETCH_IMPL + if (!to || !from) + return; + // FIXME: check cudaMemcpyAsync return and decide synchronization policy. + cudaMemcpyAsync(to.get(), from.get(), bytes, cudaMemcpyDeviceToDevice, + m_stream); +# else + (void)to; + (void)from; + (void)bytes; +# endif +} + +void +CudaArena::run(int width, int height, Kernel kernel, tagged_ptr data) +{ +# if TEXTURE_DEVICE_ENABLE_CUDA_SKETCH_IMPL + // The current Kernel signature is host-callable. A real CUDA path needs a + // separate __global__ entrypoint and launch configuration. +# endif + (void)width; + (void)height; + (void)kernel; + (void)data; +} + +#endif + +} // namespace texture_device diff --git a/testsuite/texture-device/arena.h b/testsuite/texture-device/arena.h new file mode 100644 index 0000000000..635edfbc86 --- /dev/null +++ b/testsuite/texture-device/arena.h @@ -0,0 +1,189 @@ +// Copyright Contributors to the OpenImageIO project. +// SPDX-License-Identifier: Apache-2.0 +// https://github.com/AcademySoftwareFoundation/OpenImageIO + +#pragma once + +#include +#include +#include +#include + +#include "tagged_ptr.h" + +#if defined(__has_include) +# if __has_include() +# include +# define TEXTURE_DEVICE_HAS_CUDA_RUNTIME 1 +# endif +# if __has_include() +# include +# endif +#endif + +#ifndef TEXTURE_DEVICE_HAS_CUDA_RUNTIME +# define TEXTURE_DEVICE_HAS_CUDA_RUNTIME 0 +#endif + +namespace texture_device { + +class NullArena { +public: + using Kernel = void (*)(int x, int y, tagged_ptr data); + + // Methods an arena should implement + tagged_ptr alloc(size_t bytes, const char* purpose); + void free(tagged_ptr p); + void copy_to(tagged_ptr device, tagged_ptr host, + size_t bytes); + void copy_from(tagged_ptr host, tagged_ptr device, + size_t bytes); + void copy_in(tagged_ptr to, tagged_ptr from, + size_t bytes); + // Optionally, if we intend to run things on it + void run(int width, int height, Kernel kernel, tagged_ptr data); +}; + +class Host : public NullArena { +public: + ~Host(); + + tagged_ptr alloc(size_t bytes, const char* purpose); + void free(tagged_ptr p); + void copy_to(tagged_ptr device, tagged_ptr host, + size_t bytes); + void copy_from(tagged_ptr host, tagged_ptr device, + size_t bytes); + void copy_in(tagged_ptr to, tagged_ptr from, + size_t bytes); + template struct Atomic : public std::atomic { + using std::atomic::atomic; + using std::atomic::operator=; + + T load() const + { + return std::atomic::load(std::memory_order_acquire); + } + void store(const T& v) + { + std::atomic::store(v, std::memory_order_release); + } + bool cas(T& expected, const T& desired) + { + return std::atomic::compare_exchange_strong( + expected, desired, std::memory_order_acq_rel, + std::memory_order_acquire); + } + T fetch_add(const T& v) + { + return std::atomic::fetch_add(v, std::memory_order_acq_rel); + } + }; + +private: + struct AllocationRecord { + size_t bytes = 0; + const char* purpose = nullptr; + }; + + // Record every alloc pointer with its purpose to validate frees and leaks. + std::unordered_map m_allocated; +}; + +class MockDevice : public NullArena { +public: + template using Atomic = Host::Atomic; + using NullArena::Kernel; + + ~MockDevice(); + + tagged_ptr alloc(size_t bytes, const char* purpose); + void free(tagged_ptr p); + void copy_to(tagged_ptr device, tagged_ptr host, + size_t bytes); + void copy_from(tagged_ptr host, tagged_ptr device, + size_t bytes); + void copy_in(tagged_ptr to, tagged_ptr from, + size_t bytes); + void run(int width, int height, Kernel kernel, tagged_ptr data); + +private: + struct AllocationRecord { + size_t bytes = 0; + const char* purpose = nullptr; + }; + + // Record every alloc pointer with its purpose to validate frees and leaks. + std::unordered_map m_allocated; +}; + +#if TEXTURE_DEVICE_HAS_CUDA_RUNTIME + +// Guidance-only sketch for a real device arena backed by CUDA runtime APIs. +// This is intentionally not wired into the current test flow. +class CudaArena : public NullArena { +public: + template +# ifdef __CUDA_ARCH__ + struct Atomic : public cuda::atomic { + using cuda::atomic::atomic; + using cuda::atomic::operator=; + + T load() const + { + return cuda::atomic::load(cuda::memory_order_acquire); + } + void store(const T& v) + { + cuda::atomic::store(v, cuda::memory_order_release); + } + bool cas(T& expected, const T& desired) + { + return cuda::atomic::compare_exchange_strong( + expected, desired, cuda::memory_order_acq_rel, + cuda::memory_order_acquire); + } + T fetch_add(const T& v) + { + return cuda::atomic::fetch_add(v, cuda::memory_order_acq_rel); + } + }; +# else + using Atomic = Host::Atomic; +# endif + using NullArena::Kernel; + + CudaArena() = default; + explicit CudaArena(cudaStream_t stream) + : m_stream(stream) + { + } + + ~CudaArena(); + + tagged_ptr alloc(size_t bytes, const char* purpose); + void free(tagged_ptr p); + void copy_to(tagged_ptr device, tagged_ptr host, + size_t bytes); + void copy_from(tagged_ptr host, tagged_ptr device, + size_t bytes); + void copy_in(tagged_ptr to, tagged_ptr from, + size_t bytes); + void run(int width, int height, Kernel kernel, tagged_ptr data); + + void set_stream(cudaStream_t stream) { m_stream = stream; } + cudaStream_t stream() const { return m_stream; } + +private: + struct AllocationRecord { + size_t bytes = 0; + const char* purpose = nullptr; + }; + + cudaStream_t m_stream = nullptr; + std::unordered_map m_allocated; +}; + +#endif + +} // namespace texture_device diff --git a/testsuite/texture-device/blend.cpp b/testsuite/texture-device/blend.cpp new file mode 100644 index 0000000000..4c55060371 --- /dev/null +++ b/testsuite/texture-device/blend.cpp @@ -0,0 +1,36 @@ +// Copyright Contributors to the OpenImageIO project. +// SPDX-License-Identifier: Apache-2.0 +// https://github.com/AcademySoftwareFoundation/OpenImageIO + +#include "blend.h" +#include "filtering_impl.h" +#include "texture_device_impl.h" + +namespace texture_device { + +void +blend_kernel(int x, int y, tagged_ptr data) +{ + tagged_ptr op(data); + const float resx = static_cast(op->width); + const float resy = static_cast(op->height); + const float invx = 1.0f / resx; + const float invy = 1.0f / resy; + const float u = (static_cast(x) + 0.5f) * invx; + const float v = (static_cast(y) + 0.5f) * invy; + + Vec2 duA { invx, 0.0f }; + Vec2 dvA { 0.0f, invy }; + Vec2 duB = 4.0f * duA; + Vec2 dvB = 4.0f * dvA; + + RGBA A = op->texture_system.lookup(op->name_a, u, v, duA, dvA); + RGBA B = op->texture_system.lookup(op->name_b, u, v, duB, dvB); + + if (op->texture_system.failures()) + return; + + op->output_buffer[y * op->width + x] = 0.5f * A + 0.5f * B; +} + +} // namespace texture_device diff --git a/testsuite/texture-device/blend.h b/testsuite/texture-device/blend.h new file mode 100644 index 0000000000..4804d4886e --- /dev/null +++ b/testsuite/texture-device/blend.h @@ -0,0 +1,23 @@ +// Copyright Contributors to the OpenImageIO project. +// SPDX-License-Identifier: Apache-2.0 +// https://github.com/AcademySoftwareFoundation/OpenImageIO + +#pragma once + +#include "texture_device_decl.h" + +namespace texture_device { + +struct BlendOp { + int width = 0; + int height = 0; + OIIO::ustringhash name_a; + OIIO::ustringhash name_b; + DTextureSystem texture_system; + tagged_ptr output_buffer = nullptr; +}; + +void +blend_kernel(int x, int y, tagged_ptr data); + +} // namespace texture_device diff --git a/testsuite/texture-device/closed_hashmap.h b/testsuite/texture-device/closed_hashmap.h new file mode 100644 index 0000000000..9fd21f5540 --- /dev/null +++ b/testsuite/texture-device/closed_hashmap.h @@ -0,0 +1,357 @@ +// Copyright Contributors to the OpenImageIO project. +// SPDX-License-Identifier: Apache-2.0 +// https://github.com/AcademySoftwareFoundation/OpenImageIO + +#pragma once + +#include +#include +#include +#include +#include + +#include "arena.h" + +namespace texture_device { + +#define OPT_FUNCT(condition, return_type) \ + template std::enable_if_t + +#define OPT_CONSTRUCT(condition) \ + template> + +#define OPT_FIELD(condition, field_type) \ + std::conditional_t + +template +class ClosedHashMap { + template using Atomic = typename Arena::template Atomic; + static constexpr bool IsManager + = !std::is_same::value; + struct Slot; + + +public: + template + friend class ClosedHashMap; + + using Managed = ClosedHashMap; + + // Managed only functionality + + OPT_CONSTRUCT(!IsManager) + ClosedHashMap(Arena& arena, uint32_t capacity) + : m_arena(&arena) + , m_capacity(capacity) + , m_size(0) + , m_overflowed(false) + , m_owner(true) + { + m_slots = m_arena->alloc(sizeof(Slot) * capacity, + "ClosedHashMap::ClosedHashMap"); + clear(); + } + OPT_CONSTRUCT(!IsManager) + ClosedHashMap(const ClosedHashMap& o) + { + m_arena = o.m_arena; + m_slots = o.m_slots; + m_capacity = o.m_capacity; + m_size = o.m_size; + m_overflowed = o.m_overflowed; + m_owner = false; + } + OPT_CONSTRUCT(!IsManager) + ClosedHashMap() + : m_arena(nullptr) + , m_slots(nullptr) + , m_capacity(0) + , m_size(0) + , m_overflowed(false) + , m_owner(false) + { + } + + OPT_FUNCT(!IsManager, const ClosedHashMap&) + operator=(const ClosedHashMap & o) + { + m_arena = o.m_arena; + m_slots = o.m_slots; + m_capacity = o.m_capacity; + m_size = o.m_size.load(); + m_overflowed = o.m_overflowed.load(); + m_owner = false; + return *this; + } + + // Manager only functionality + + OPT_CONSTRUCT(IsManager) + ClosedHashMap(Arena& arena, ManagedArena& marena, size_t capacity, + Managed& managed) + : m_arena(&arena) + , m_capacity(capacity) + , m_size(0) + , m_overflowed(false) + , m_owner(true) + , m_managed(managed) + { + // Do not assign from a temporary Managed(marena, capacity) here. + // Managed-side operator= intentionally sets m_owner=false (view + // semantics), and a temporary would free the allocated slots on + // destruction, leaving dangling pointers in m_managed. + m_managed.m_arena = &marena; + m_managed.m_capacity = m_capacity; + m_managed.m_size = m_size.load(); + m_managed.m_overflowed = m_overflowed.load(); + m_managed.m_owner = true; + m_managed.m_slots + = m_managed.m_arena->alloc(sizeof(Slot) * capacity, + "ClosedHashMap::ClosedHashMap"); + m_slots = m_arena->alloc(sizeof(Slot) * capacity, + "ClosedHashMap::ClosedHashMap"); + + // Initialize manager-side slot states. Managed side is synchronized + // from manager before use. + clear(); + } + + OPT_FUNCT(IsManager, bool) + grow() { return resize(std::max(8u, m_capacity * 2u)); } + + OPT_FUNCT(IsManager, void) + sync_to_managed() + { + m_managed.m_arena->copy_to(m_managed.m_slots, m_slots, + sizeof(Slot) * m_capacity); + m_managed.m_size = m_size.load(); + m_managed.m_overflowed = m_overflowed.load(); + } + OPT_FUNCT(IsManager, void) + sync_from_managed() + { + m_managed.m_arena->copy_from(m_slots, m_managed.m_slots, + sizeof(Slot) * m_capacity); + m_size = m_managed.m_size.load(); + m_overflowed = m_managed.m_overflowed.load(); + } + + // Both manager and managed + + ~ClosedHashMap() + { + if (m_owner) + m_arena->free(m_slots); + } + + bool insert(const Key& key, const Value& value) + { + if (!m_slots || m_capacity == 0) + return false; + + if constexpr (IsManager) { + const uint32_t used = m_size.load(); + if ((uint64_t(used + 1u) * 8u) >= (uint64_t(m_capacity) * 7u)) { + if (!grow()) { + m_overflowed.store(true); + return false; + } + } + } + + const uint64_t key_state = slot_state(key); + const uint32_t idx = probe_start(key); + for (uint32_t i = 0; i < m_capacity; ++i) { + Slot& slot = m_slots[(idx + i) % m_capacity]; + + // A reserved slot is being written by another thread; spin until + // that writer publishes the final state. + uint64_t observed = slot.state.load(); + while (observed == kReservedState) + observed = slot.state.load(); + + if (observed == key_state && slot.key == key) { + slot.value = value; + return true; + } + + if (observed == kEmptyState) { + uint64_t expected = kEmptyState; + // Reserve this slot first so key/value writes are not raced by + // another inserter probing the same position. + if (!slot.state.cas(expected, kReservedState)) + continue; + + slot.key = key; + slot.value = value; + slot.state.store(key_state); + m_size.fetch_add(1u); + return true; + } + } + + m_overflowed.store(true); + return false; + } + + bool find(const Key& key, Value& value) const + { + if (!m_slots || m_capacity == 0) + return false; + + const uint64_t key_state = slot_state(key); + const uint32_t idx = probe_start(key); + for (uint32_t i = 0; i < m_capacity; ++i) { + const Slot& slot = m_slots[(idx + i) % m_capacity]; + + uint64_t observed = slot.state.load(); + while (observed == kReservedState) + observed = slot.state.load(); + + if (observed == kEmptyState) + // Open addressing invariant: first empty slot means key is not + // present in this probe chain. + return false; + + if (observed == key_state && slot.key == key) { + value = slot.value; + return true; + } + } + return false; + } + + uint32_t capacity() const { return m_capacity; } + uint32_t size() const { return m_size.load(); } + bool overflowed() const { return m_overflowed.load(); } + bool failed() const { return size() != 0 || overflowed(); } + + void clear() + { + if (!m_slots) + return; + for (uint32_t i = 0; i < m_capacity; ++i) + m_slots[i].state.store(kEmptyState); + m_size.store(0u); + m_overflowed.store(false); + } + + class Iterator { + public: + Iterator(const Slot* slot, const Slot* end) + : m_slot(slot) + , m_end(end) + { + advance_to_occupied(); + } + + const Key& operator*() const { return m_slot->key; } + + Iterator& operator++() + { + ++m_slot; + advance_to_occupied(); + return *this; + } + + bool operator!=(const Iterator& other) const + { + return m_slot != other.m_slot; + } + + private: + void advance_to_occupied() + { + while (m_slot < m_end) { + const uint64_t state = m_slot->state.load(); + if (state != kEmptyState && state != kReservedState) + break; + ++m_slot; + } + } + + const Slot* m_slot; + const Slot* m_end; + }; + + Iterator begin() const + { + if (!m_slots) + return Iterator(nullptr, nullptr); + return Iterator(m_slots.get(), m_slots.get() + m_capacity); + } + + Iterator end() const + { + if (!m_slots) + return Iterator(nullptr, nullptr); + return Iterator(m_slots.get() + m_capacity, m_slots.get() + m_capacity); + } + +private: + struct Slot { + Key key {}; + Value value {}; + Atomic state { kEmptyState }; + }; + + + static constexpr uint64_t kEmptyState = 0; + static constexpr uint64_t kReservedState = 1; + + static uint64_t slot_state(const Key& key) + { + uint64_t s = static_cast(Hash {}(key)); + // Reserve 0/1 for sentinel states so all occupied slots use >=2. + if (s == kEmptyState || s == kReservedState) + s += 2; + return s; + } + + uint32_t probe_start(const Key& key) const + { + return static_cast(slot_state(key) % m_capacity); + } + + bool resize(uint32_t new_capacity) + { + tagged_ptr old_slots = m_slots; + m_slots = m_arena->alloc(sizeof(Slot) * new_capacity, + "ClosedHashMap::resize"); + uint32_t old_capacity = m_capacity; + m_capacity = new_capacity; + if constexpr (IsManager) { + clear(); + // Reinsert so probe positions are rebuilt for the new capacity. + for (uint32_t i = 0; i < old_capacity; ++i) { + const Slot& slot = old_slots[i]; + const uint64_t state = slot.state.load(); + if (state == kEmptyState || state == kReservedState) + continue; + insert(slot.key, slot.value); + } + m_managed.resize(new_capacity); + m_managed.m_arena->copy_to(m_managed.m_slots, m_slots, + sizeof(Slot) * m_capacity); + } // Otherwise reallocating slots is enough, manager will copy data + + m_arena->free(old_slots); + return true; + } + + Arena* m_arena; + tagged_ptr m_slots; + uint32_t m_capacity = 0; + Atomic m_size { 0 }; + Atomic m_overflowed { false }; + bool m_owner; + // Manager only + OPT_FIELD(IsManager, Managed&) m_managed; +}; + +} // namespace texture_device + +#undef OPT_FIELD +#undef OPT_CONSTRUCT +#undef OPT_FUNCT diff --git a/testsuite/texture-device/device_tests.cpp b/testsuite/texture-device/device_tests.cpp new file mode 100644 index 0000000000..04fbc2215a --- /dev/null +++ b/testsuite/texture-device/device_tests.cpp @@ -0,0 +1,186 @@ +// Copyright Contributors to the OpenImageIO project. +// SPDX-License-Identifier: Apache-2.0 +// https://github.com/AcademySoftwareFoundation/OpenImageIO + +#include "filtering_impl.h" // IWYU pragma: keep +#include "texture_device_decl.h" +#include "texture_device_impl.h" // IWYU pragma: keep + +#include + +namespace texture_device { + +template +struct DTextureSystemTestAccess { + using System = DTextureSystem; + + static auto ellipse_axes(System& system, const TextureRecord& texture, + Vec2 du, Vec2 dv) + { + (void)system; + return FilteringUnitTestsAccess::ellipse_axes(texture, du, dv); + } + + static auto compute_miplevels(System& system, const TextureRecord& texture, + const MipAnisoFilter::EllipseAxes& axes) + { + (void)system; + return FilteringUnitTestsAccess::compute_miplevels(texture, axes); + } + + static auto + compute_ellipse_sampling(System& system, const TextureRecord& texture, + const MipAnisoFilter::EllipseAxes& axes, int mip) + { + (void)system; + return FilteringUnitTestsAccess::compute_ellipse_sampling(texture, axes, + mip); + } +}; + +namespace { + + struct ConstantHash { + size_t operator()(uint64_t) const { return 1; } + }; + +} // namespace + +using UnitRequestQueue = DTextureSystem::RequestQueue; +bool +run_device_unit_tests() +{ + constexpr uint64_t kGridHash = 0x3f6b4e91u; + constexpr uint64_t kCheckerHash = 0x13a56d2bu; + constexpr uint64_t kBulkHash = 0x67d9aa11u; + + Host host; + + using Map = ClosedHashMap, Host>; + using CollisionMap = ClosedHashMap; + + Map map(host, 16); + map.clear(); + + CollisionMap collision_map(host, 8); + collision_map.clear(); + + // Basic closed-hash insert/find path. + if (!map.insert(10, 100)) + return false; + if (!map.insert(26, 260)) + return false; + + int value = 0; + if (!map.find(10, value) || value != 100) + return false; + if (!map.find(26, value) || value != 260) + return false; + // Missing key must not report a hit. + if (map.find(42, value)) + return false; + + // Force collisions into one probe chain and validate retrieval order + // independence. + if (!collision_map.insert(1, 11) || !collision_map.insert(9, 99) + || !collision_map.insert(17, 171)) + return false; + if (!collision_map.find(17, value) || value != 171) + return false; + if (!collision_map.find(9, value) || value != 99) + return false; + + UnitRequestQueue queue(host, DTextureSystem::kMaxRequests); + queue.clear(); + + Request req_a; + req_a.type = RequestType::MissingTexture; + req_a.tile.texture_hash = kGridHash; + + Request req_b; + req_b.type = RequestType::MissingTexture; + req_b.tile.texture_hash = kCheckerHash; + + // Request queue is deduplicating: repeated inserts of the same request + // should succeed but not grow queue size. + if (!queue.insert(req_a, true)) + return false; + if (!queue.insert(req_a, true)) + return false; + if (!queue.insert(req_b, true)) + return false; + + std::array::kMaxRequests> bulk; + for (uint32_t i = 0; i < DTextureSystem::kMaxRequests; ++i) { + bulk[i].type = RequestType::MissingTile; + bulk[i].tile = TileCoords { kBulkHash, uint16_t(i), uint16_t(i / 64), + uint16_t(i % 4), 0 }; + } + + UnitRequestQueue full_queue(host, DTextureSystem::kMaxRequests); + full_queue.clear(); + // Fill to capacity to exercise overflow signaling behavior. + for (uint32_t i = 0; i < DTextureSystem::kMaxRequests; ++i) { + if (!full_queue.insert(bulk[i], true)) + return false; + } + // Existing key still succeeds when table is full. + if (full_queue.insert(bulk[0], true) != true) + return false; + // New key must fail once no slot remains. + if (full_queue.insert(Request { RequestType::MissingTile, + TileCoords { kBulkHash, 12345, 0, 0, 0 } }, + true)) + return false; + if (!full_queue.overflowed()) + return false; + + { + DTextureSystem system(host); + using Access = DTextureSystemTestAccess; + TextureRecord tex; + tex.width = 1024; + tex.height = 1024; + + // Mip transition: small derivatives should stay near base level. + auto axes_lo = Access::ellipse_axes(system, tex, + Vec2 { 1.0f / 1024.0f, 0.0f }, + Vec2 { 0.0f, 1.0f / 1024.0f }); + auto mips_lo = Access::compute_miplevels(system, tex, axes_lo); + if (mips_lo.mip_levels[0] != 0) + return false; + + // Mip transition: larger derivatives should move to coarser levels. + auto axes_hi = Access::ellipse_axes(system, tex, + Vec2 { 8.0f / 1024.0f, 0.0f }, + Vec2 { 0.0f, 8.0f / 1024.0f }); + auto mips_hi = Access::compute_miplevels(system, tex, axes_hi); + if (mips_hi.mip_levels[0] <= mips_lo.mip_levels[0]) + return false; + + // Anisotropy extremes should increase sample count, capped by max. + auto axes_aniso = Access::ellipse_axes(system, tex, + Vec2 { 8.0f / 1024.0f, 0.0f }, + Vec2 { 0.0f, 1.0f / 1024.0f }); + auto ellipse = Access::compute_ellipse_sampling(system, tex, axes_aniso, + 0); + if (ellipse.nsamples < 2 + || ellipse.nsamples > MipAnisoFilter::kMaxSamples) + return false; + + // Wrap behavior regression checks. + bool in_range = true; + if (wrap_coord(-1, 8, WrapMode::Clamp, in_range) != 0 || !in_range) + return false; + if (wrap_coord(-1, 8, WrapMode::Periodic, in_range) != 7 || !in_range) + return false; + (void)wrap_coord(-1, 8, WrapMode::Black, in_range); + if (in_range) + return false; + } + + // Dedup result for req_a + req_b should keep queue size at 2. + return queue.size() == 2; +} + +} // namespace texture_device diff --git a/testsuite/texture-device/devicecheck_main.cpp b/testsuite/texture-device/devicecheck_main.cpp new file mode 100644 index 0000000000..fc5fda25d8 --- /dev/null +++ b/testsuite/texture-device/devicecheck_main.cpp @@ -0,0 +1,23 @@ +// Copyright Contributors to the OpenImageIO project. +// SPDX-License-Identifier: Apache-2.0 +// https://github.com/AcademySoftwareFoundation/OpenImageIO + +#include "blend.h" + +int +main() +{ + // Keep a direct kernel call in this binary so link-time symbol resolution + // covers the full devicecheck path. + texture_device::BlendOp op; + op.width = 1; + op.height = 1; + texture_device::blend_kernel(0, 0, + texture_device::tagged_ptr(&op, "Host")); + + // Force an observable read of kernel-mutated state to avoid DCE. + volatile bool kernel_recorded_failure = op.texture_system.failures(); + (void)kernel_recorded_failure; + + return texture_device::run_device_unit_tests() ? 0 : 1; +} diff --git a/testsuite/texture-device/filtering_decl.h b/testsuite/texture-device/filtering_decl.h new file mode 100644 index 0000000000..7f3359c96f --- /dev/null +++ b/testsuite/texture-device/filtering_decl.h @@ -0,0 +1,118 @@ +// Copyright Contributors to the OpenImageIO project. +// SPDX-License-Identifier: Apache-2.0 +// https://github.com/AcademySoftwareFoundation/OpenImageIO + +#pragma once + +#include + +#include + +#include "vector_lite.h" + +namespace texture_device { + +using Vec2 = Imath::V2f; + +struct TileCoords { + uint64_t texture_hash = 0; + uint16_t x = 0; + uint16_t y = 0; + uint16_t mip = 0; + uint16_t pad = 0; // zero padded for the hash to be deterministic + + bool operator==(const TileCoords& other) const + { + return texture_hash == other.texture_hash && x == other.x + && y == other.y && mip == other.mip; + } +}; + +struct TileRecord; +struct TextureRecord; +struct FilteringUnitTestsAccess; + +struct Filter { + struct Sample { + float weight; + TileCoords tcoords; + int local_x; + int local_y; + const TileRecord* tile; + }; +}; + +struct MipAnisoFilter : public Filter { + static constexpr uint32_t kMaxSamples = 8; + static constexpr uint32_t kMaxMipLevel = 4; + + struct MipSelection { + unsigned mip_levels[2] = { 0, 0 }; + float mip_blend = 0.0f; + }; + + struct EllipseSampling { + Vec2 axis_uv = Vec2(1.0f, 0.0f); + unsigned nsamples = 1; + float span_uv = 0.0f; + }; + + struct EllipseAxes { + Vec2 major_uv = Vec2(1.0f, 0.0f); + float major_rho = 1.0f; + float minor_rho = 1.0f; + }; + + const TextureRecord& texture; + EllipseAxes axes; + MipSelection mips; + unsigned num_mips; + + MipAnisoFilter(const TextureRecord& texture, Vec2 du, Vec2 dv); + vector_lite + generate_samples(unsigned mip_i, float u, float v) const; + +private: + friend struct FilteringUnitTestsAccess; + + static EllipseAxes ellipse_axes(const TextureRecord& texture, Vec2 du, + Vec2 dv); + static MipSelection compute_miplevels(const TextureRecord& texture, + const EllipseAxes& axes); + static EllipseSampling + compute_ellipse_sampling(const TextureRecord& texture, + const EllipseAxes& axes, int mip); + + + static int floor_div(int n, int d); + static float anisotropic_aspect(float major_rho, float minor_rho, + int max_aniso) + { + const float safe_minor = std::max(1.0f, minor_rho); + return std::clamp(major_rho / safe_minor, 1.0f, float(max_aniso)); + } +}; + +struct FilteringUnitTestsAccess { + static MipAnisoFilter::EllipseAxes + ellipse_axes(const TextureRecord& texture, Vec2 du, Vec2 dv) + { + return MipAnisoFilter::ellipse_axes(texture, du, dv); + } + + static MipAnisoFilter::MipSelection + compute_miplevels(const TextureRecord& texture, + const MipAnisoFilter::EllipseAxes& axes) + { + return MipAnisoFilter::compute_miplevels(texture, axes); + } + + static MipAnisoFilter::EllipseSampling + compute_ellipse_sampling(const TextureRecord& texture, + const MipAnisoFilter::EllipseAxes& axes, int mip) + { + return MipAnisoFilter::compute_ellipse_sampling(texture, axes, mip); + } +}; + +} // namespace texture_device \ No newline at end of file diff --git a/testsuite/texture-device/filtering_impl.h b/testsuite/texture-device/filtering_impl.h new file mode 100644 index 0000000000..ff2544cd30 --- /dev/null +++ b/testsuite/texture-device/filtering_impl.h @@ -0,0 +1,178 @@ +// Copyright Contributors to the OpenImageIO project. +// SPDX-License-Identifier: Apache-2.0 +// https://github.com/AcademySoftwareFoundation/OpenImageIO + +#pragma once + +#include "filtering_decl.h" +#include "texture_device_decl.h" +#include + +namespace texture_device { + +inline MipAnisoFilter::MipAnisoFilter(const TextureRecord& texture, Vec2 du, + Vec2 dv) + : texture(texture) +{ + axes = ellipse_axes(texture, du, dv); + mips = compute_miplevels(texture, axes); + num_mips = mips.mip_levels[1] == mips.mip_levels[0] ? 1 : 2; +} + +inline int +MipAnisoFilter::floor_div(int n, int d) +{ + int q = n / d; + int r = n % d; + if (r != 0 && ((r < 0) != (d < 0))) + --q; + return q; +} + +inline vector_lite +MipAnisoFilter::generate_samples(unsigned mip_i, float u, float v) const +{ + EllipseSampling e = compute_ellipse_sampling(texture, axes, + mips.mip_levels[mip_i]); + + + // Precompute normalized per-sample gaussian weights along the major axis. + vector_lite gaussian; + float gaussian_weight_sum = 0; + for (unsigned s = 0; s < e.nsamples; ++s) { + const float t = (s + 0.5f) / e.nsamples - 0.5f; + const float w = e.nsamples > 1 ? std::exp(-2.0f * t * t) : 1; + gaussian.push_back(w); + gaussian_weight_sum += w; + } + vector_lite samples; + + const uint32_t mip = mips.mip_levels[mip_i]; + const uint32_t base_w = std::max(1u, texture.width); + const uint32_t base_h = std::max(1u, texture.height); + const uint32_t level_w = std::max(1u, base_w >> mip); + const uint32_t level_h = std::max(1u, base_h >> mip); + + if (gaussian_weight_sum <= 0.0f) + return samples; + + unsigned s = 0; + for (const float gaussian_sample : gaussian) { + const float t = (e.nsamples > 1) + ? ((float(s) + 0.5f) / float(e.nsamples) - 0.5f) + : 0.0f; + const Vec2 uv = Vec2(u, v) + (2.0f * t * e.span_uv) * e.axis_uv; + const float gaussian_w = gaussian_sample / gaussian_weight_sum; + + const float fu = uv.x * float(level_w); + const float fv = uv.y * float(level_h); + const int x0 = int(std::floor(fu)); + const int y0 = int(std::floor(fv)); + const int x1 = x0 + 1; + const int y1 = y0 + 1; + const float tx = fu - float(x0); + const float ty = fv - float(y0); + + const int tap_x[4] = { x0, x1, x0, x1 }; + const int tap_y[4] = { y0, y0, y1, y1 }; + const float bilinear_w[4] = { (1.0f - tx) * (1.0f - ty), + tx * (1.0f - ty), (1.0f - tx) * ty, + tx * ty }; + + // Expand each anisotropic sample position into 4 bilinear taps. + for (int tap = 0; tap < 4; ++tap) { + const float w = gaussian_w * bilinear_w[tap]; + if (w <= 0.0f) + continue; + + bool x_ok = true; + bool y_ok = true; + const int wrapped_x = wrap_coord(tap_x[tap], int(level_w), + texture.swrap, x_ok); + const int wrapped_y = wrap_coord(tap_y[tap], int(level_h), + texture.twrap, y_ok); + if (!x_ok || !y_ok) + continue; + + const uint16_t tile_x = floor_div(wrapped_x, + int(TileRecord::kTileWidth)); + const uint16_t tile_y = floor_div(wrapped_y, + int(TileRecord::kTileHeight)); + const int local_x = wrapped_x + - tile_x * int(TileRecord::kTileWidth); + const int local_y = wrapped_y + - tile_y * int(TileRecord::kTileHeight); + + if (samples.size() >= samples.capacity()) + continue; + + samples.push_back({ + w, + TileCoords { texture.name.hash(), tile_x, tile_y, uint16_t(mip), + 0 }, + local_x, + local_y, + nullptr, + }); + } + ++s; + } + return samples; +} + +inline MipAnisoFilter::EllipseSampling +MipAnisoFilter::compute_ellipse_sampling(const TextureRecord& texture, + const EllipseAxes& axes, int mip) +{ + (void)texture; + const float mip_scale = 1.0f / float(1u << uint32_t(std::max(0, mip))); + const float aspect = anisotropic_aspect(axes.major_rho * mip_scale, + axes.minor_rho * mip_scale, + MipAnisoFilter::kMaxSamples); + const unsigned nsamples = std::clamp(unsigned(std::ceil(aspect)), 1u, + MipAnisoFilter::kMaxSamples); + + const float span_uv = 0.5f * axes.major_uv.length(); + return EllipseSampling { axes.major_uv, nsamples, span_uv }; +} + +inline MipAnisoFilter::EllipseAxes +MipAnisoFilter::ellipse_axes(const TextureRecord& texture, Vec2 du, Vec2 dv) +{ + const float tex_w = float(std::max(1u, texture.width)); + const float tex_h = float(std::max(1u, texture.height)); + const float rho_u = std::sqrt((du.x * tex_w) * (du.x * tex_w) + + (du.y * tex_h) * (du.y * tex_h)); + const float rho_v = std::sqrt((dv.x * tex_w) * (dv.x * tex_w) + + (dv.y * tex_h) * (dv.y * tex_h)); + + const bool major_is_u = rho_u >= rho_v; + const Vec2 major_uv = major_is_u ? du : dv; + const float major_rho = std::max(rho_u, rho_v); + const float minor_rho = std::max(1.0f, std::min(rho_u, rho_v)); + + const float major_uv_len = major_uv.length(); + Vec2 axis_uv = Vec2(1.0f, 0.0f); + if (major_uv_len > 0.0f) + axis_uv = major_uv / major_uv_len; + + return EllipseAxes { axis_uv, major_rho, minor_rho }; +} + +inline MipAnisoFilter::MipSelection +MipAnisoFilter::compute_miplevels(const TextureRecord& texture, + const EllipseAxes& axes) +{ + (void)texture; + const float rho = std::max(1.0f, axes.minor_rho); + + const float lod = std::max(0.0f, std::log2(rho)); + const uint32_t mip0 = std::clamp(uint32_t(std::floor(lod)), 0u, + kMaxMipLevel); + const uint32_t mip1 = std::min(kMaxMipLevel, mip0 + 1); + const float blend = std::clamp(lod - float(mip0), 0.0f, 1.0f); + return MipSelection { { mip0, mip1 }, blend }; +} + + +} // namespace texture_device \ No newline at end of file diff --git a/testsuite/texture-device/host.cpp b/testsuite/texture-device/host.cpp new file mode 100644 index 0000000000..dfbe489919 --- /dev/null +++ b/testsuite/texture-device/host.cpp @@ -0,0 +1,144 @@ +// Copyright Contributors to the OpenImageIO project. +// SPDX-License-Identifier: Apache-2.0 +// https://github.com/AcademySoftwareFoundation/OpenImageIO + +#include "arena.h" +#include "blend.h" +#include "texture_device_impl.h" // IWYU pragma: keep +#include "texture_loader.h" + +#include + +#include +#include +#include + +using OIIO::ImageOutput; +using OIIO::ImageSpec; +using OIIO::TypeDesc; +using OIIO::ustringhash; +using texture_device::blend_kernel; +using texture_device::BlendOp; +using texture_device::DTextureSystem; +using texture_device::Host; +using texture_device::MockDevice; +using texture_device::Request; +using texture_device::RGBA; +using texture_device::run_device_unit_tests; +using texture_device::tagged_ptr; +using texture_device::TextureLoader; + +namespace { + +bool +write_output(const std::string& filename, int width, int height, + const std::vector& pixels) +{ + auto out = ImageOutput::create(filename); + if (!out) + return false; + + ImageSpec spec(width, height, 4, TypeDesc::FLOAT); + if (!out->open(filename, spec)) + return false; + + bool ok = out->write_image(TypeDesc::FLOAT, pixels.data()); + ok = out->close() && ok; + return ok; +} + +} // namespace + +int +main() +{ + if (!run_device_unit_tests()) { + std::cout << "texture-device: unit-tests-failed\n"; + return 2; + } + + const int width = 256; + const int height = 256; + + Host host; + MockDevice device; + TextureLoader loader; + + BlendOp op; + DTextureSystem textures(host, device, op.texture_system); + static constexpr const char* kTextureSearchPaths[] = { + "../common/textures", + "../../../testsuite/common/textures", + "testsuite/common/textures", + }; + for (const char* path : kTextureSearchPaths) + loader.add_texture_path(path); + textures.request_queue().clear(); + + std::vector output(width * height, RGBA(0.0f, 0.0f, 0.0f, 0.0f)); + + op.width = width; + op.height = height; + op.name_a = ustringhash("grid.tx"); + op.name_b = ustringhash("checker.tx"); + op.output_buffer = nullptr; + textures.sync_to_managed(); + + const size_t output_bytes = output.size() * sizeof(RGBA); + tagged_ptr device_output = device.alloc(output_bytes, + "host::output_buffer"); + tagged_ptr device_op = device.alloc(sizeof(BlendOp), + "host::blend_op"); + device.copy_to(device_output, tagged_ptr(output.data(), "Host"), + output_bytes); + + op.output_buffer = device_output; + + constexpr int max_passes = 8; + int completed_passes = 0; + bool converged = false; + for (int pass = 0; pass < max_passes; ++pass) { + textures.begin_launch(); + device.copy_to(device_op, tagged_ptr(&op, "Host"), + sizeof(op)); + device.run(width, height, &blend_kernel, device_op); + device.copy_from(tagged_ptr(&op, "Host"), device_op, sizeof(op)); + ++completed_passes; + + textures.sync_from_managed(); + if (textures.needs_retry()) { + textures.sync_to_managed(); + continue; + } + + if (!textures.failures()) { + converged = true; + break; + } + + textures.process_requests( + [&](const Request& req, DTextureSystem* ts) { + return loader.process_request(req, ts); + }); + + textures.sync_to_managed(); + } + + if (!converged) + std::cout << "texture-device: retry-limit-hit\n"; + + device.copy_from(tagged_ptr(output.data(), "Host"), device_output, + output_bytes); + + (void)write_output("out.exr", width, height, output); + + device.free(device_op); + device.free(device_output); + + std::cout << "texture-device: startup-ok\n"; + std::cout << "texture-device: passes=" << completed_passes << "\n"; + std::cout << "texture-device: requests=" << textures.request_queue().size() + << "\n"; + std::cout << "texture-device: wrote out.exr\n"; + return 0; +} diff --git a/testsuite/texture-device/img/checker.jpg b/testsuite/texture-device/img/checker.jpg new file mode 100644 index 0000000000..cd245d41f3 Binary files /dev/null and b/testsuite/texture-device/img/checker.jpg differ diff --git a/testsuite/texture-device/img/grid.jpg b/testsuite/texture-device/img/grid.jpg new file mode 100644 index 0000000000..4f7427c1a2 Binary files /dev/null and b/testsuite/texture-device/img/grid.jpg differ diff --git a/testsuite/texture-device/img/result.jpg b/testsuite/texture-device/img/result.jpg new file mode 100644 index 0000000000..82091c2d82 Binary files /dev/null and b/testsuite/texture-device/img/result.jpg differ diff --git a/testsuite/texture-device/ref/out.exr b/testsuite/texture-device/ref/out.exr new file mode 100644 index 0000000000..539b820f8e Binary files /dev/null and b/testsuite/texture-device/ref/out.exr differ diff --git a/testsuite/texture-device/ref/out.txt b/testsuite/texture-device/ref/out.txt new file mode 100644 index 0000000000..761d3bb207 --- /dev/null +++ b/testsuite/texture-device/ref/out.txt @@ -0,0 +1,4 @@ +texture-device: startup-ok +texture-device: passes=3 +texture-device: requests=0 +texture-device: wrote out.exr diff --git a/testsuite/texture-device/run.py b/testsuite/texture-device/run.py new file mode 100644 index 0000000000..9a34b2a8c1 --- /dev/null +++ b/testsuite/texture-device/run.py @@ -0,0 +1,11 @@ +#!/usr/bin/env python + +# Copyright Contributors to the OpenImageIO project. +# SPDX-License-Identifier: Apache-2.0 +# https://github.com/AcademySoftwareFoundation/OpenImageIO + +# texture-device is built by top-level CMake via add_subdirectory, so +# tests only need to run the already-built executable. +command += run_app(oiio_app("texture-device").strip()) + +outputs = [ "out.txt", "out.exr" ] diff --git a/testsuite/texture-device/stream.h b/testsuite/texture-device/stream.h new file mode 100644 index 0000000000..d1025c59f1 --- /dev/null +++ b/testsuite/texture-device/stream.h @@ -0,0 +1,264 @@ +// Copyright Contributors to the OpenImageIO project. +// SPDX-License-Identifier: Apache-2.0 +// https://github.com/AcademySoftwareFoundation/OpenImageIO + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include "arena.h" + +namespace texture_device { + +#define OPT_FUNCT(condition, return_type) \ + template std::enable_if_t + +#define OPT_CONSTRUCT(condition) \ + template> + +#define OPT_FIELD(condition, field_type) \ + std::conditional_t + +template +class Stream { +public: + template friend class Stream; + + static constexpr bool IsManager + = !std::is_same::value; + using Managed = Stream; + + // Managed side functionality + + OPT_FUNCT(!IsManager, T&) + operator[](uint32_t i) + { + assert(i < m_size); + const uint32_t page_index = i / kPageSize; + const uint32_t off = i % kPageSize; + assert(page_index < num_pages()); + tagged_ptr page = m_pages[page_index]; + return (*page)[off]; + } + + OPT_FUNCT(!IsManager, const T&) + operator[](uint32_t i) const + { + assert(i < m_size); + const uint32_t page_index = i / kPageSize; + const uint32_t off = i % kPageSize; + assert(page_index < num_pages()); + tagged_ptr page = m_pages[page_index]; + return (*page)[off]; + } + + OPT_CONSTRUCT(!IsManager) + Stream() + : m_pages(nullptr) + , m_page_capacity(0) + , m_size(0) + , m_owner(false) + , m_arena(nullptr) + { + } + + OPT_CONSTRUCT(!IsManager) + Stream(Arena& arena) + : m_pages(nullptr) + , m_page_capacity(0) + , m_size(0) + , m_owner(false) + , m_arena(&arena) + { + } + + OPT_CONSTRUCT(!IsManager) + Stream(const Stream& o) + : m_pages(o.m_pages) + , m_page_capacity(o.m_page_capacity) + , m_size(o.m_size) + , m_owner(false) + , m_arena(o.m_arena) + { + } + + OPT_FUNCT(!IsManager, const Stream&) + operator=(const Stream & o) + { + m_pages = o.m_pages; + m_page_capacity = o.m_page_capacity; + m_size = o.m_size; + m_owner = false; + m_arena = o.m_arena; + return *this; + } + + // Manager side functionality + + OPT_CONSTRUCT(IsManager) + Stream(Arena& arena, ManagedArena& marena, Managed& managed) + : m_pages(nullptr) + , m_page_capacity(0) + , m_size(0) + , m_owner(true) // Manager always owns + , m_arena(&arena) + , m_managed(managed) + , m_staging_page(nullptr) + , m_staging_index(0) + { + // Initialize managed-side storage explicitly; assignment would force + // m_owner=false and break teardown invariants when no pages are grown. + m_managed.m_pages = nullptr; + m_managed.m_page_capacity = 0; + m_managed.m_size = 0; + m_managed.m_owner = false; + m_managed.m_arena = &marena; + m_staging_page = m_arena->alloc(sizeof(Page), "Stream::Stream()"); + } + + ~Stream() + { + if (m_owner) { + if constexpr (IsManager) { + for (uint32_t i = 0, end = num_pages(); i < end; ++i) + m_managed.m_arena->free(m_pages[i]); + assert(!m_managed.m_owner); + m_managed.m_arena->free(m_managed.m_pages); + m_managed.m_pages = nullptr; + m_arena->free(m_staging_page); + m_staging_page = nullptr; + } + m_arena->free(m_pages); + } + } + + OPT_FUNCT(IsManager, void) + clear() + { + if (m_pages) { + uint32_t npages = num_pages(); + for (uint32_t i = 0; i < npages; ++i) { + m_managed.m_arena->free(m_pages[i]); + m_pages[i] = nullptr; + } + if (m_managed.m_pages && m_page_capacity) { + m_managed.m_arena->copy_to(m_managed.m_pages, m_pages, + sizeof(tagged_ptr) + * m_page_capacity); + } + } + m_staging_index = m_size = m_managed.m_size = 0; + } + + OPT_FUNCT(IsManager, uint32_t) + push_back(const T& value) + { + // When a page fills, flush the staging page into managed-visible + // storage before starting the next one. + if (m_size && (m_size % kPageSize) == 0) + sync_stage(); + uint32_t req_page_capacity = (m_size + kPageSize) / kPageSize; + if (req_page_capacity > m_page_capacity) + grow_page_capacity(); + uint32_t page = req_page_capacity - 1; + if (!m_pages[page]) { + // Pages are allocated in managed arena memory so device-side reads + // can index them directly after sync. + m_pages[page] = m_managed.m_arena->alloc(sizeof(Page), + "Stream::push_back"); + m_managed.m_arena->copy_to(m_managed.m_pages, m_pages, + sizeof(tagged_ptr) + * m_page_capacity); + m_staging_index = 0; + } + (*m_staging_page)[m_staging_index] = value; + m_size++; + m_staging_index = m_size % kPageSize; + return m_size - 1; + } + + OPT_FUNCT(IsManager, void) + sync_to_managed() + { + // Only the mutable tail page needs to be flushed; sealed pages are + // already synchronized when they are completed. + if (m_size != m_managed.m_size) + sync_stage(); + } + OPT_FUNCT(IsManager, void) + sync_from_managed() {} + +private: + static constexpr uint32_t kPageSize = uint32_t((64ull * 1024ull * 1024ull) + / sizeof(T)); // 64MB pages + static_assert(kPageSize > 0, "Stream page size must be non-zero"); + + using Page = std::array; + + uint32_t num_pages() const + { + return m_size / kPageSize + (m_size % kPageSize ? 1 : 0); + } + + size_t grow_page_capacity() + { + // Both manager and managed + uint32_t newcap = m_page_capacity ? m_page_capacity * 2 : 2; + tagged_ptr> newp + = m_arena->alloc(sizeof(tagged_ptr) * newcap, + "Stream::grow_page_capacity"); + // Ensure newly allocated pointer slots start null. + std::fill_n(newp.get(), newcap, tagged_ptr(nullptr)); + m_arena->copy_in(newp, m_pages, + sizeof(tagged_ptr) * m_page_capacity); + m_page_capacity = newcap; + if (m_owner) + m_arena->free(m_pages); + m_pages = newp; + m_owner = true; + // Managed only + if constexpr (IsManager) { + m_managed.grow_page_capacity(); + m_managed.m_owner = false; + assert(m_page_capacity == m_managed.m_page_capacity); + } + return m_page_capacity; + } + + void sync_stage() + { + if (m_size == 0) + return; + const uint32_t last_page = (m_size - 1) / kPageSize; + assert(last_page < num_pages()); + // The staging page mirrors the current tail page and is copied as a + // whole page for simplicity. + m_managed.m_arena->copy_to(m_pages[last_page], m_staging_page, + sizeof(Page)); + m_managed.m_size = m_size; + } + + // Both managed and manager + tagged_ptr> m_pages; + uint32_t m_page_capacity; + uint32_t m_size; + bool m_owner; + Arena* m_arena; + + // Manager only + OPT_FIELD(IsManager, Managed&) m_managed; + OPT_FIELD(IsManager, tagged_ptr) m_staging_page; + OPT_FIELD(IsManager, uint32_t) m_staging_index; +}; + +} // namespace texture_device + +#undef OPT_FIELD +#undef OPT_CONSTRUCT +#undef OPT_FUNCT diff --git a/testsuite/texture-device/tagged_ptr.h b/testsuite/texture-device/tagged_ptr.h new file mode 100644 index 0000000000..8fd4868a6d --- /dev/null +++ b/testsuite/texture-device/tagged_ptr.h @@ -0,0 +1,171 @@ +// Copyright Contributors to the OpenImageIO project. +// SPDX-License-Identifier: Apache-2.0 +// https://github.com/AcademySoftwareFoundation/OpenImageIO + +#pragma once + +#include +#include +#include +#include +#include + +#include + +namespace texture_device { + +#define OPT_FUNCT(condition, return_type) \ + template std::enable_if_t + +extern uint64_t g_tagged_ptr_context; + +inline uint64_t +ptrtag(const char* s) +{ + if (!s || !s[0]) + return 0; + return OIIO::Strutil::strhash64(std::strlen(s), s); +} + +template +inline constexpr uint64_t +ptrtag(const char (&s)[N]) +{ + static_assert(N > 1, "tag literal must be non-empty"); + return OIIO::Strutil::strhash64(N - 1, s); +} + +template class tagged_ptr { +public: + using element_type = T; + static constexpr bool IsVoid = std::is_void::value; + static constexpr bool IsConst = std::is_const::value; + using ElementRef = std::add_lvalue_reference_t; + using ElementPtr = std::add_pointer_t; + + tagged_ptr() = default; + tagged_ptr(std::nullptr_t) + : m_ptr(nullptr) + , m_tag(0) + { + } + + template::value>> + tagged_ptr(U* p) = delete; + + template::value>> + tagged_ptr(U* p, const char* context_tag) + : m_ptr(p) + , m_tag(ptrtag(context_tag)) + { + } + + template::value>> + tagged_ptr(const tagged_ptr& other) + : m_ptr(other.get()) + , m_tag(other.tag()) + { + } + + template::value>> + tagged_ptr& operator=(const tagged_ptr& other) + { + m_ptr = other.get(); + m_tag = other.tag(); + return *this; + } + + tagged_ptr(const tagged_ptr& other) + : m_ptr(static_cast(other.get())) + , m_tag(other.tag()) + { + } + + OPT_FUNCT(!IsVoid, tagged_ptr&) + operator=(const tagged_ptr& other) + { + m_ptr = static_cast(other.get()); + m_tag = other.tag(); + return *this; + } + + tagged_ptr(const tagged_ptr& other) + : m_ptr(static_cast(other.get())) + , m_tag(other.tag()) + { + } + + OPT_FUNCT(!IsVoid && IsConst, tagged_ptr&) + operator=(const tagged_ptr& other) + { + m_ptr = static_cast(other.get()); + m_tag = other.tag(); + return *this; + } + + T* get() const { return m_ptr; } + + explicit operator bool() const { return m_ptr != nullptr; } + + bool operator==(std::nullptr_t) const { return m_ptr == nullptr; } + bool operator!=(std::nullptr_t) const { return m_ptr != nullptr; } + + bool operator==(const tagged_ptr& other) const + { + return m_ptr == other.get() && m_tag == other.tag(); + } + + bool operator!=(const tagged_ptr& other) const + { + return !(*this == other); + } + + bool is(uint64_t tag) const { return m_tag == tag; } + bool is(const char* context_tag) const + { + return m_tag == ptrtag(context_tag); + } + uint64_t tag() const { return m_tag; } + + OPT_FUNCT(!IsVoid, ElementRef) + operator*() const + { + check_deref_allowed(); + return *m_ptr; + } + + OPT_FUNCT(!IsVoid, ElementPtr) + operator->() const + { + check_deref_allowed(); + return m_ptr; + } + + OPT_FUNCT(!IsVoid, ElementRef) + operator[](size_t i) const + { + check_deref_allowed(); + return m_ptr[i]; + } + +private: + template friend class tagged_ptr; + + void check_deref_allowed() const + { + // Enforce explicit host/device context boundaries at dereference time. + if (m_tag != g_tagged_ptr_context) + std::abort(); + } + + T* m_ptr = nullptr; + uint64_t m_tag = 0; +}; + +} // namespace texture_device + +#undef OPT_FUNCT diff --git a/testsuite/texture-device/texture_device_decl.h b/testsuite/texture-device/texture_device_decl.h new file mode 100644 index 0000000000..90abdeaf9e --- /dev/null +++ b/testsuite/texture-device/texture_device_decl.h @@ -0,0 +1,271 @@ +// Copyright Contributors to the OpenImageIO project. +// SPDX-License-Identifier: Apache-2.0 +// https://github.com/AcademySoftwareFoundation/OpenImageIO + +#pragma once + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "arena.h" +#include "closed_hashmap.h" +#include "filtering_decl.h" +#include "stream.h" + +namespace texture_device { + +#define OPT_FUNCT(condition, return_type) \ + template std::enable_if_t + +#define OPT_CONSTRUCT(condition) \ + template> + +#define OPT_FIELD(condition, field_type) \ + std::conditional_t + +#define OPT_FUNCT_DECL(condition) \ + template> + +using RGBA = Imath::C4f; +using Vec2 = Imath::V2f; + +struct TileCoordsHash { + size_t operator()(const TileCoords& c) const; +}; + +enum class RequestType : uint8_t { MissingTexture, MissingTile }; + +struct Request { + RequestType type = RequestType::MissingTexture; + TileCoords tile; + + bool operator==(const Request& other) const; +}; + +struct RequestHash { + size_t operator()(const Request& req) const; + static uint64_t hash_mix_u64(uint64_t h, uint64_t v); +}; + +enum class WrapMode : uint8_t { Clamp, Periodic, Black }; + +int +wrap_coord(int coord, int size, WrapMode mode, bool& in_range); + +struct TileRecord { + static constexpr uint32_t kTileWidth = 64; + static constexpr uint32_t kTileHeight = 64; + TileCoords tile; + std::array pixels {}; +}; + +struct TextureRecord { + OIIO::ustringhash name; + bool ready = false; + uint32_t width = 0; + uint32_t height = 0; + WrapMode swrap = WrapMode::Clamp; + WrapMode twrap = WrapMode::Clamp; + + void reset(OIIO::ustringhash texture_name) + { + name = texture_name; + ready = false; + width = 0; + height = 0; + swrap = WrapMode::Periodic; + twrap = WrapMode::Periodic; + } +}; + +template struct DTextureSystemTestAccess; + +template +class DTextureSystem { +public: + template friend class DTextureSystem; + template friend struct DTextureSystemTestAccess; + + template using Atomic = typename Arena::template Atomic; + using Managed = DTextureSystem; + static constexpr bool IsManager + = !std::is_same::value; + + static constexpr uint32_t kMaxResidentTextures = 32; + static constexpr uint32_t kMaxResidentTiles = 2048; + static constexpr uint32_t kMaxTilesPerTexture = 256; + static constexpr uint32_t kMaxRequests = 1024; + static constexpr uint32_t kTileWidth = TileRecord::kTileWidth; + static constexpr uint32_t kTileHeight = TileRecord::kTileHeight; + + using RequestQueue + = ClosedHashMap; + using TextureMap = ClosedHashMap, + Arena, ManagedArena>; + + // Managed only functionality + + OPT_CONSTRUCT(!IsManager) + DTextureSystem() + : m_failed(false) + , m_texture_count(0) + , m_tile_count(0) + { + } + + OPT_CONSTRUCT(!IsManager) + DTextureSystem(Arena& arena) + : m_queue(arena, kMaxRequests) + , m_failed(false) + , m_texture_count(0) + , m_texture_lookup(arena, kMaxResidentTextures) + , m_tile_pool(arena) + , m_tile_index(arena, kMaxResidentTiles) + , m_tile_count(0) + { + std::fill(std::begin(m_textures), std::end(m_textures), + TextureRecord {}); + } + + OPT_CONSTRUCT(!IsManager) + DTextureSystem(const DTextureSystem& o) + { + m_queue = o.m_queue; + m_failed = o.m_failed.load(); + memcpy(m_textures, o.m_textures, sizeof(m_textures)); + m_texture_count = o.m_texture_count; + m_texture_lookup = o.m_texture_lookup; + m_tile_pool = o.m_tile_pool; + m_tile_index = o.m_tile_index; + m_tile_count = o.m_tile_count; + } + + + OPT_FUNCT(!IsManager, DTextureSystem&) + operator=(const DTextureSystem & o) + { + m_queue = o.m_queue; + m_failed = o.m_failed.load(); + memcpy(m_textures, o.m_textures, sizeof(m_textures)); + m_texture_count = o.m_texture_count; + m_texture_lookup = o.m_texture_lookup; + m_tile_pool = o.m_tile_pool; + m_tile_index = o.m_tile_index; + m_tile_count = o.m_tile_count; + return *this; + } + + // Manager only functionality + + OPT_CONSTRUCT(IsManager) + DTextureSystem(Arena& arena, ManagedArena& marena, Managed& managed) + : m_managed(managed) + , m_queue(arena, marena, kMaxRequests, m_managed.m_queue) + , m_failed(false) + , m_texture_count(0) + , m_texture_lookup(arena, marena, kMaxResidentTextures, + m_managed.m_texture_lookup) + , m_tile_pool(arena, marena, m_managed.m_tile_pool) + , m_tile_index(arena, marena, kMaxResidentTiles, m_managed.m_tile_index) + , m_tile_count(0) + { + std::fill(std::begin(m_textures), std::end(m_textures), + TextureRecord {}); + } + + OPT_FUNCT(IsManager, void) begin_launch() { m_failed.store(false); } + OPT_FUNCT_DECL(IsManager) + bool set_texture_ready(OIIO::ustringhash name, uint32_t width, + uint32_t height); + OPT_FUNCT_DECL(IsManager) + bool set_tile_payload(OIIO::ustringhash name, TileCoords tile, int width, + int height, const std::vector& pixels); + OPT_FUNCT(IsManager, bool) + needs_retry() + { + if (!m_queue.overflowed()) + return false; + if (!m_queue.grow()) + return false; + + // Overflow indicates dropped requests; clear and re-run to recollect. + m_queue.clear(); + m_failed.store(false); + return true; + } + OPT_FUNCT_DECL(IsManager) + bool find_or_add_texture(OIIO::ustringhash name, uint32_t& index); + OPT_FUNCT_DECL(IsManager) + void sync_to_managed(); + OPT_FUNCT_DECL(IsManager) + void sync_from_managed(); + + template> + bool process_requests(Func&& fn) + { + RequestQueue& queue = request_queue(); + for (const Request& req : queue) { + if (!fn(req, this)) + return false; + } + queue.clear(); + return true; + } + + // Both managed and manager + + RequestQueue& request_queue() { return m_queue; } + const RequestQueue& request_queue() const { return m_queue; } + + bool failures() const + { + const RequestQueue& queue = request_queue(); + return m_failed.load() || queue.failed(); + } + RGBA lookup(OIIO::ustringhash name, float u, float v, Vec2 du, Vec2 dv, + float rnd = -1); + + using TilePool = Stream; + using TileIndexMap = ClosedHashMap; + +private: + bool find_texture(OIIO::ustringhash name, uint32_t& index) const; + + static void copy_tile_pixels(TileRecord& dst, + const std::vector& pixels); + + template bool load_tiles(SampleArray& samples); + + // Manager only + OPT_FIELD(IsManager, Managed&) m_managed; + // Both + RequestQueue m_queue; + Atomic m_failed; + TextureRecord m_textures[kMaxResidentTextures]; + uint32_t m_texture_count; + TextureMap m_texture_lookup; + TilePool m_tile_pool; + TileIndexMap m_tile_index; + uint32_t m_tile_count; +}; + +bool +run_device_unit_tests(); + +} // namespace texture_device + +#undef OPT_FIELD +#undef OPT_CONSTRUCT +#undef OPT_FUNCT_DECL +#undef OPT_FUNCT diff --git a/testsuite/texture-device/texture_device_impl.h b/testsuite/texture-device/texture_device_impl.h new file mode 100644 index 0000000000..01b44d9dab --- /dev/null +++ b/testsuite/texture-device/texture_device_impl.h @@ -0,0 +1,332 @@ +// Copyright Contributors to the OpenImageIO project. +// SPDX-License-Identifier: Apache-2.0 +// https://github.com/AcademySoftwareFoundation/OpenImageIO + +#pragma once + +#include "texture_device_decl.h" + +#include +#include + +// Just to reduce verbosity +#define DTS DTextureSystem +#define OPT_FUNCT_IMPL template + +namespace texture_device { + +inline size_t +TileCoordsHash::operator()(const TileCoords& c) const +{ + uint64_t h + = OIIO::farmhash::inlined::Hash64(reinterpret_cast(&c), + sizeof(TileCoords)); + return static_cast(h); +} + +inline bool +Request::operator==(const Request& other) const +{ + return type == other.type && tile == other.tile; +} + +inline size_t +RequestHash::operator()(const Request& req) const +{ + uint64_t h = 1469598103934665603ull; + h = hash_mix_u64(h, static_cast(req.type)); + h = hash_mix_u64(h, req.tile.texture_hash); + h = hash_mix_u64(h, + static_cast(static_cast(req.tile.x))); + h = hash_mix_u64(h, + static_cast(static_cast(req.tile.y))); + h = hash_mix_u64(h, static_cast( + static_cast(req.tile.mip))); + return static_cast(h); +} + +inline uint64_t +RequestHash::hash_mix_u64(uint64_t h, uint64_t v) +{ + h ^= v + 0x9e3779b97f4a7c15ull + (h << 6) + (h >> 2); + return h; +} + +template +OPT_FUNCT_IMPL inline bool +DTS::set_texture_ready(OIIO::ustringhash name, uint32_t width, uint32_t height) +{ + uint32_t index = 0; + if (!find_or_add_texture(name, index)) + return false; + if (width == 0 || height == 0) + return false; + m_textures[index].width = width; + m_textures[index].height = height; + m_textures[index].swrap = WrapMode::Periodic; + m_textures[index].twrap = WrapMode::Periodic; + m_textures[index].ready = true; + return true; +} + +template +OPT_FUNCT_IMPL inline bool +DTS::set_tile_payload(OIIO::ustringhash name, TileCoords tile, int width, + int height, const std::vector& pixels) +{ + if (width != int(kTileWidth) || height != int(kTileHeight)) + return false; + if (pixels.size() != size_t(width) * size_t(height)) + return false; + + uint32_t index = 0; + if (!find_texture(name, index)) + return false; + TextureRecord& texture = m_textures[index]; + if (!texture.ready) + return false; + + uint32_t tile_pool_index = 0; + if (m_tile_index.find(tile, tile_pool_index)) { + if (tile_pool_index >= m_tile_count) + return false; + return true; + } + + if (m_tile_count >= kMaxResidentTiles) + return false; + + if (m_tile_index.size() >= kMaxTilesPerTexture) + return false; + + tile_pool_index = static_cast(m_tile_count); + if (!m_tile_index.insert(tile, tile_pool_index)) + return false; + + TileRecord record; + record.tile = tile; + copy_tile_pixels(record, pixels); + + m_tile_pool.push_back(record); + ++m_tile_count; + + return true; +} + +template +OPT_FUNCT_IMPL inline bool +DTextureSystem::find_or_add_texture(OIIO::ustringhash name, + uint32_t& index) +{ + if (find_texture(name, index)) + return true; + + if (m_texture_count >= kMaxResidentTextures) + return false; + + const uint32_t new_index = m_texture_count++; + m_textures[new_index].reset(name); + if (!m_texture_lookup.insert(name.hash(), new_index)) + return false; + index = new_index; + return true; +} + +template +OPT_FUNCT_IMPL inline void +DTextureSystem::sync_to_managed() +{ + m_queue.sync_to_managed(); + m_texture_lookup.sync_to_managed(); + m_tile_pool.sync_to_managed(); + m_tile_index.sync_to_managed(); + m_managed.m_failed = m_failed.load(); + memcpy(m_managed.m_textures, m_textures, sizeof(m_textures)); + m_managed.m_texture_count = m_texture_count; + m_managed.m_tile_count = m_tile_count; +} + +template +OPT_FUNCT_IMPL inline void +DTextureSystem::sync_from_managed() +{ + m_queue.sync_from_managed(); + m_texture_lookup.sync_from_managed(); + m_tile_pool.sync_from_managed(); + m_tile_index.sync_from_managed(); + m_failed = m_managed.m_failed.load(); + memcpy(m_textures, m_managed.m_textures, sizeof(m_textures)); + m_texture_count = m_managed.m_texture_count; + m_tile_count = m_managed.m_tile_count; +} + +template +inline bool +DTS::find_texture(OIIO::ustringhash name, uint32_t& index) const +{ + uint32_t out = 0; + if (!m_texture_lookup.find(name.hash(), out)) + return false; + if (out >= m_texture_count) + return false; + if (m_textures[out].name != name) + return false; + index = out; + return true; +} + +template +inline void +DTS::copy_tile_pixels(TileRecord& dst, const std::vector& pixels) +{ + std::copy(pixels.begin(), pixels.end(), dst.pixels.begin()); +} + +inline int +wrap_coord(int coord, int size, WrapMode mode, bool& in_range) +{ + in_range = true; + if (size <= 0) { + in_range = false; + return 0; + } + if (mode == WrapMode::Clamp) + return std::clamp(coord, 0, size - 1); + if (mode == WrapMode::Periodic) { + const int m = coord % size; + return (m < 0) ? (m + size) : m; + } + if (coord < 0 || coord >= size) + in_range = false; + return coord; +} + +template +template +inline bool +DTS::load_tiles(SampleArray& samples) +{ + bool missing_any = false; + RequestQueue& queue = request_queue(); + for (size_t i = 0; i < samples.size(); ++i) { + if (samples[i].tile || samples[i].weight == 0) + continue; + + uint32_t tile_pool_index = 0; + const bool found = m_tile_index.find(samples[i].tcoords, + tile_pool_index) + && tile_pool_index < m_tile_count; + + // Apply the same tile-resolution result to all duplicate tap entries + // with identical tile coordinates in this sample batch. + for (size_t j = i; j < samples.size(); ++j) { + if (!(samples[j].tcoords == samples[i].tcoords)) + continue; + if (found) { + samples[j].tile = &m_tile_pool[tile_pool_index]; + } else { + samples[j].weight = 0; + } + } + + if (!found) { + missing_any = true; + // RequestQueue deduplicates repeated misses across pixels/taps. + if (!queue.insert(Request { RequestType::MissingTile, + samples[i].tcoords }, + true)) { + m_failed.store(true); + } + } + } + return !missing_any; +} + +template +inline RGBA +DTS::lookup(OIIO::ustringhash name, float u, float v, Vec2 du, Vec2 dv, + float rnd) +{ + uint32_t texture_index = 0; + if (find_texture(name, texture_index)) { + const TextureRecord& texture = m_textures[texture_index]; + if (texture.ready) { + RGBA accum(0.0f, 0.0f, 0.0f, 0.0f); + bool failure = false; + MipAnisoFilter filter(texture, du, dv); + if (rnd >= 0.0f && filter.num_mips > 1) { + // Experimental path: stochastic mip choice collapses trilinear + // blending to one selected mip for this lookup. + const float blend = filter.mips.mip_blend; + assert(0 <= rnd && rnd < 1); + unsigned selected = rnd < blend ? 0 : 1; + rnd = selected ? (rnd - blend) / (1 - blend) : rnd / blend; + filter.mips.mip_levels[0] = filter.mips.mip_levels[selected]; + filter.num_mips = 1; // This will make the weight 1.0 + } + for (unsigned mip_i = 0; mip_i < filter.num_mips; ++mip_i) { + const float mip_weight + = filter.num_mips == 1 + ? 1 + : (mip_i ? filter.mips.mip_blend + : 1 - filter.mips.mip_blend); + auto samples = filter.generate_samples(mip_i, u, v); + + if (!load_tiles(samples)) { + failure = true; + continue; + } + + if (rnd >= 0) { + // Experimental path: select one tap by cumulative + // distribution instead of deterministic weighted sum. + float total_weight = 0; + for (size_t i = 0; i < samples.size(); ++i) + total_weight += samples[i].weight; + float sum = 0; + for (size_t i = 0; i < samples.size(); ++i) { + sum += samples[i].weight; + if (rnd < sum / total_weight + || i == samples.size() - 1) { + const size_t idx = size_t(samples[i].local_y) + * size_t(kTileWidth) + + size_t(samples[i].local_x); + accum = samples[i].tile->pixels[idx]; + break; + } + } + } else { + for (size_t i = 0; i < samples.size(); ++i) { + if (!samples[i].tile) + continue; + const size_t idx = size_t(samples[i].local_y) + * size_t(kTileWidth) + + size_t(samples[i].local_x); + accum += mip_weight * samples[i].weight + * samples[i].tile->pixels[idx]; + } + } + } + + return failure ? RGBA(1.0f, 0.0f, 1.0f, 1.0f) : accum; + } + } + + Request req; + req.type = (find_texture(name, texture_index) + && m_textures[texture_index].ready) + ? RequestType::MissingTile + : RequestType::MissingTexture; + req.tile = TileCoords { name.hash() }; + + RequestQueue& queue = request_queue(); + if (!queue.insert(req, true)) + m_failed.store(true); + + return RGBA(1.0f, 0.0f, 1.0f, 1.0f); +} + +#undef DTS +#undef OPT_FUNCT_IMPL + +} // namespace texture_device diff --git a/testsuite/texture-device/texture_loader.cpp b/testsuite/texture-device/texture_loader.cpp new file mode 100644 index 0000000000..cd8b1256e7 --- /dev/null +++ b/testsuite/texture-device/texture_loader.cpp @@ -0,0 +1,165 @@ +// Copyright Contributors to the OpenImageIO project. +// SPDX-License-Identifier: Apache-2.0 +// https://github.com/AcademySoftwareFoundation/OpenImageIO + +#include "texture_loader.h" + +#include + +#include +#include +#include + +namespace texture_device { + +namespace { + + bool loader_error(const std::string& msg) + { + std::cerr << "texture-device: " << msg << "\n"; + return false; + } + +} // namespace + +void +TextureLoader::add_texture_path(std::string path) +{ + if (!path.empty()) + m_texture_paths.emplace_back(std::move(path)); +} + +bool +TextureLoader::resolve_texture(uint64_t texture_hash, + OIIO::ustringhash& texture_name, + std::string& filename) const +{ + texture_name = OIIO::ustringhash::from_hash(texture_hash); + const char* basename = texture_name.c_str(); + if (!basename || !basename[0]) + return false; + + for (const std::string& path : m_texture_paths) { + const std::filesystem::path fullpath = std::filesystem::path(path) + / basename; + if (!std::filesystem::exists(fullpath)) + continue; + filename = fullpath.string(); + return true; + } + + return false; +} + +bool +TextureLoader::query_texture_info(const std::string& filename, int& width, + int& height) +{ + // Cache dimensions/validation for repeated requests to the same file. + auto cache_it = m_metadata_cache.find(filename); + if (cache_it != m_metadata_cache.end()) { + width = cache_it->second.width; + height = cache_it->second.height; + return true; + } + + auto in = OIIO::ImageInput::open(filename); + if (!in) + return false; + + const OIIO::ImageSpec& spec = in->spec(); + if (spec.width <= 0 || spec.height <= 0 || spec.nchannels <= 0) { + in->close(); + return false; + } + if (spec.nchannels != 3 && spec.nchannels != 4) { + in->close(); + return loader_error("unsupported channel count for " + filename + + " got " + std::to_string(spec.nchannels) + + " expected 3 or 4"); + } + + if (spec.tile_width != int(TileRecord::kTileWidth) + || spec.tile_height != int(TileRecord::kTileHeight)) { + in->close(); + return loader_error("unsupported tile size for " + filename + " got " + + std::to_string(spec.tile_width) + "x" + + std::to_string(spec.tile_height) + " expected " + + std::to_string(TileRecord::kTileWidth) + "x" + + std::to_string(TileRecord::kTileHeight)); + } + + width = spec.width; + height = spec.height; + in->close(); + m_metadata_cache[filename] = TextureMetadata { width, height }; + return true; +} + +bool +TextureLoader::load_tile_payload(const std::string& filename, TileCoords tile, + std::vector& tile_pixels) +{ + int width = 0; + int height = 0; + if (!query_texture_info(filename, width, height)) + return false; + + auto in = OIIO::ImageInput::open(filename); + if (!in) + return false; + + constexpr int tile_w = int(TileRecord::kTileWidth); + constexpr int tile_h = int(TileRecord::kTileHeight); + const int mip = std::max(0u, unsigned(tile.mip)); + if (!in->seek_subimage(0, mip)) { + in->close(); + return loader_error("failed to seek mip " + std::to_string(mip) + + " for " + filename); + } + const OIIO::ImageSpec mipspec = in->spec(); + + if (mipspec.tile_width != tile_w || mipspec.tile_height != tile_h) { + in->close(); + return loader_error( + "unsupported mip tile size for " + filename + " mip " + + std::to_string(mip) + " got " + std::to_string(mipspec.tile_width) + + "x" + std::to_string(mipspec.tile_height) + " expected " + + std::to_string(tile_w) + "x" + std::to_string(tile_h)); + } + + const int tile_x = mipspec.x + tile.x * tile_w; + const int tile_y = mipspec.y + tile.y * tile_h; + const int nchannels = std::max(1, mipspec.nchannels); + if (nchannels != 3 && nchannels != 4) { + in->close(); + return loader_error("unsupported channel count for " + filename + + " mip " + std::to_string(mip) + " got " + + std::to_string(nchannels) + " expected 3 or 4"); + } + + std::vector raw(size_t(tile_w) * size_t(tile_h) * size_t(nchannels), + 0.0f); + // Read one tile in float and normalize payload to RGBA for device lookup. + const bool ok = in->read_tile(tile_x, tile_y, 0, OIIO::TypeDesc::FLOAT, + raw.data()); + in->close(); + if (!ok) + return false; + + tile_pixels.assign(size_t(tile_w * tile_h), RGBA(0.0f, 0.0f, 0.0f, 0.0f)); + for (int y = 0; y < tile_h; ++y) { + for (int x = 0; x < tile_w; ++x) { + const size_t p = size_t(y) * size_t(tile_w) + size_t(x); + const size_t o = p * size_t(nchannels); + const float r = raw[o + 0]; + const float g = raw[o + 1]; + const float b = raw[o + 2]; + const float a = (nchannels == 4) ? raw[o + 3] : 1.0f; + tile_pixels[p] = RGBA(r, g, b, a); + } + } + return true; +} + +} // namespace texture_device diff --git a/testsuite/texture-device/texture_loader.h b/testsuite/texture-device/texture_loader.h new file mode 100644 index 0000000000..204b5ffa41 --- /dev/null +++ b/testsuite/texture-device/texture_loader.h @@ -0,0 +1,73 @@ +// Copyright Contributors to the OpenImageIO project. +// SPDX-License-Identifier: Apache-2.0 +// https://github.com/AcademySoftwareFoundation/OpenImageIO + +#pragma once + +#include "texture_device_decl.h" + +#include + +#include +#include +#include + +namespace texture_device { + +class TextureLoader { +public: + using RGBA = texture_device::RGBA; + + void add_texture_path(std::string path); + + bool resolve_texture(uint64_t texture_hash, OIIO::ustringhash& texture_name, + std::string& filename) const; + + bool query_texture_info(const std::string& filename, int& width, + int& height); + + bool load_tile_payload(const std::string& filename, TileCoords tile, + std::vector& tile_pixels); + + template + bool process_request(const Request& req, TextureSystem* ts) + { + const uint64_t key = req.tile.texture_hash; + OIIO::ustringhash texture_name; + std::string filename; + if (!resolve_texture(key, texture_name, filename)) + return false; + + switch (req.type) { + case RequestType::MissingTexture: { + int tex_w = 0; + int tex_h = 0; + if (!query_texture_info(filename, tex_w, tex_h)) + return false; + return ts->set_texture_ready(texture_name, uint32_t(tex_w), + uint32_t(tex_h)); + } + case RequestType::MissingTile: { + std::vector tile_pixels; + if (!load_tile_payload(filename, req.tile, tile_pixels)) + return false; + return ts->set_tile_payload(texture_name, req.tile, + int(TextureSystem::kTileWidth), + int(TextureSystem::kTileHeight), + tile_pixels); + } + default: return false; + } + } + +private: + struct TextureMetadata { + int width = 0; + int height = 0; + }; + + std::vector m_texture_paths; + std::unordered_map m_metadata_cache; +}; + +} // namespace texture_device diff --git a/testsuite/texture-device/vector_lite.h b/testsuite/texture-device/vector_lite.h new file mode 100644 index 0000000000..cabda436ef --- /dev/null +++ b/testsuite/texture-device/vector_lite.h @@ -0,0 +1,52 @@ +// Copyright Contributors to the OpenImageIO project. +// SPDX-License-Identifier: Apache-2.0 +// https://github.com/AcademySoftwareFoundation/OpenImageIO + +#pragma once + +#include +#include +#include +#include + +namespace texture_device { + +template struct vector_lite : public std::array { + using Base = std::array; + + vector_lite() + : Base {} + , m_size(0) + { + } + + size_t size() const { return m_size; } + size_t capacity() const { return N; } + bool empty() const { return m_size == 0; } + + void clear() { m_size = 0; } + + void push_back(const T& value) + { + assert(m_size < N); + (*this)[m_size++] = value; + } + + void push_back(T&& value) + { + assert(m_size < N); + (*this)[m_size++] = std::move(value); + } + + T* begin() { return this->data(); } + T* end() { return this->data() + m_size; } + const T* begin() const { return this->data(); } + const T* end() const { return this->data() + m_size; } + const T* cbegin() const { return begin(); } + const T* cend() const { return end(); } + +private: + unsigned m_size; +}; + +} // namespace texture_device