From 6d61b6112703ca2e51b92475421c3998ad874442 Mon Sep 17 00:00:00 2001 From: Felix Schlepper Date: Mon, 15 Jun 2026 11:22:22 +0200 Subject: [PATCH 1/2] ITS: make parallelisation for final track fit more consistent with the rest of the code and let TBB decide on the grain size. Signed-off-by: Felix Schlepper --- .../tracking/include/ITStracking/Constants.h | 27 ++++++----- .../ITSMFT/ITS/tracking/src/TrackerTraits.cxx | 45 ++++++++++--------- 2 files changed, 39 insertions(+), 33 deletions(-) diff --git a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/Constants.h b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/Constants.h index dfbaab4601e85..a85578deea9a2 100644 --- a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/Constants.h +++ b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/Constants.h @@ -27,18 +27,21 @@ constexpr float MB = KB * KB; constexpr float GB = MB * KB; constexpr bool DoTimeBenchmarks = true; constexpr bool SaveTimeBenchmarks = false; -constexpr float Tolerance = 1e-12; // numerical tolerance -constexpr int ClustersPerCell = 3; // number of clusters for a cell -constexpr float MaxTrackSeedQ2Pt = 1.e3f; // maximum q/pt for track seeds -constexpr int UnusedIndex = -1; // global unused flag -constexpr float UnsetValue = -999.f; // global unset value -constexpr float Radl = 9.36f; // Radiation length of Si [cm] -constexpr float Rho = 2.33f; // Density of Si [g/cm^3] -constexpr int MaxIter = 4; // Max. supported iterations -constexpr int MaxSelectedTrackletsPerCluster = 100; // vertexer: max lines per cluster -constexpr int GPUBlocks = 60; // default CUDA/HIP launch blocks -constexpr int GPUThreads = 256; // default CUDA/HIP launch threads -constexpr int GPUThreadsTotal = GPUBlocks * GPUThreads; +constexpr float Tolerance = 1e-12; // numerical tolerance +constexpr int ClustersPerCell = 3; // number of clusters for a cell +constexpr int UnusedIndex = -1; // global unused flag +constexpr float UnsetValue = -999.f; // global unset value +constexpr float Radl = 9.36f; // Radiation length of Si [cm] +constexpr float Rho = 2.33f; // Density of Si [g/cm^3] +constexpr int MaxIter = 4; // Max. supported iterations +constexpr int MaxSelectedTrackletsPerCluster = 100; // vertexer: max lines per cluster +constexpr int NumberOfConcurrentSeeds = 16; // default split per worker for the final track fit/extraploation step +constexpr int MinNumberOfConcurrentSeeds = (1 << 8); // minimum chunk size for a worker for the final track fit/extraploation step +constexpr int MaxNumberOfConcurrentSeeds = (1 << 12); // maximum chunk size for a worker for the final track fit/extraploation step +constexpr float MaxTrackSeedQ2Pt = 1.e3f; // maximum q/pt for track seeds +constexpr int GPUBlocks = 60; // default CUDA/HIP launch blocks +constexpr int GPUThreads = 256; // default CUDA/HIP launch threads +constexpr int GPUThreadsTotal = GPUBlocks * GPUThreads; // default CUDA/HIP total launched threads namespace helpers { diff --git a/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx b/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx index 181417fbe16a0..b3043fcc3c010 100644 --- a/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx +++ b/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx @@ -15,7 +15,6 @@ #include #include -#include #include #include #include @@ -25,6 +24,7 @@ #include #include +#include #include "DetectorsBase/Propagator.h" #include "GPUCommonMath.h" @@ -812,31 +812,34 @@ void TrackerTraits::findRoads(const int iteration) bounded_vector tracks(mMemoryPool.get()); mTaskArena->execute([&] { const int nSeeds = static_cast(trackSeeds.size()); - const int nWorkers = std::min(static_cast(mTaskArena->max_concurrency()), nSeeds); - const int chunkSize = std::min(nSeeds, std::clamp(nSeeds / (16 * nWorkers), 256, 4096)); - std::atomic nextSeed{0}; + const int maxConcurrency = std::max(1, mTaskArena->max_concurrency()); + const int chunkSize = std::min(nSeeds, std::clamp(nSeeds / (constants::NumberOfConcurrentSeeds * maxConcurrency), constants::MinNumberOfConcurrentSeeds, constants::MaxNumberOfConcurrentSeeds)); // acts as memory bound and minimum work + + // flush local track vector to global vector on reaching chunkSize std::mutex tracksMutex; - tbb::parallel_for(0, nWorkers, [&](const int) { + auto flushTracks = [&](bounded_vector& localTracks) { + if (localTracks.empty()) { + return; + } + std::lock_guard lock{tracksMutex}; + tracks.insert(tracks.end(), std::make_move_iterator(localTracks.begin()), std::make_move_iterator(localTracks.end())); + localTracks.clear(); + }; + + // each worker works on its own range + tbb::parallel_for(tbb::blocked_range(0, nSeeds, chunkSize), [&](const auto& range) { bounded_vector localTracks(mMemoryPool.get()); - localTracks.reserve(chunkSize); - while (true) { - const int firstSeed = nextSeed.fetch_add(chunkSize, std::memory_order_relaxed); - if (firstSeed >= nSeeds) { - break; - } - const int lastSeed = std::min(firstSeed + chunkSize, nSeeds); - for (int iSeed{firstSeed}; iSeed < lastSeed; ++iSeed) { - TrackITSExt temporaryTrack; - if (finaliseTrackSeed(trackSeeds[iSeed], temporaryTrack, iteration, tfInfos, unsortedClusters, propagator)) { - localTracks.push_back(temporaryTrack); - } + localTracks.reserve(std::min(chunkSize, static_cast(range.size()))); + for (int iSeed{range.begin()}; iSeed < range.end(); ++iSeed) { + TrackITSExt temporaryTrack; + if (finaliseTrackSeed(trackSeeds[iSeed], temporaryTrack, iteration, tfInfos, unsortedClusters, propagator)) { + localTracks.push_back(temporaryTrack); } - if (!localTracks.empty()) { - std::lock_guard lock{tracksMutex}; - tracks.insert(tracks.end(), std::make_move_iterator(localTracks.begin()), std::make_move_iterator(localTracks.end())); - localTracks.clear(); + if (static_cast(localTracks.size()) == chunkSize) { + flushTracks(localTracks); } } + flushTracks(localTracks); // flush remaining deepVectorClear(localTracks); }); From 05ede9074d12b3aa328da15cb24c5099e55d8ab7 Mon Sep 17 00:00:00 2001 From: Felix Schlepper Date: Mon, 15 Jun 2026 15:54:46 +0200 Subject: [PATCH 2/2] ITS: avoid memory shuffeling by accessing tracks via idx Signed-off-by: Felix Schlepper --- .../tracking/GPU/ITStrackingGPU/TimeFrameGPU.h | 5 +++++ .../GPU/ITStrackingGPU/TrackingKernels.h | 1 + .../ITS/tracking/GPU/cuda/TimeFrameGPU.cu | 11 +++++++++++ .../ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx | 5 ++++- .../ITS/tracking/GPU/cuda/TrackingKernels.cu | 17 ++++++++++++----- .../include/ITStracking/TrackerTraits.h | 2 +- .../ITSMFT/ITS/tracking/src/TrackerTraits.cxx | 13 +++++++++---- 7 files changed, 43 insertions(+), 11 deletions(-) diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h index 00d8dfeba2312..38de1e712108d 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h +++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h @@ -93,6 +93,7 @@ class TimeFrameGPU : public TimeFrame void createTrackITSExtDevice(const size_t); void createTrackExtensionScratchDevice(const int nThreads, const int maxHypotheses); void downloadTrackITSExtDevice(); + void downloadTrackIndicesDevice(); void downloadCellsNeighboursDevice(std::vector>&, const int); void downloadNeighboursLUTDevice(bounded_vector&, const int); void downloadCellsDevice(); @@ -120,6 +121,7 @@ class TimeFrameGPU : public TimeFrame const auto getDeviceTrackingTopologyView() const { return mDeviceTrackingTopologyView; } int* getDeviceROFramesClusters(const int layer) { return mROFramesClustersDevice[layer]; } auto& getTrackITSExt() { return mTrackITSExt; } + auto& getTrackIndices() { return mTrackIndices; } Vertex* getDeviceVertices() { return mPrimaryVerticesDevice; } int* getDeviceROFramesPV() { return mROFramesPVDevice; } unsigned char* getDeviceUsedClusters(const int); @@ -127,6 +129,7 @@ class TimeFrameGPU : public TimeFrame // Hybrid TrackITSExt* getDeviceTrackITSExt() { return mTrackITSExtDevice; } + int* getDeviceTrackIndices() { return mTrackIndicesDevice; } TrackExtensionHypothesis* getDeviceActiveTrackExtensionHypotheses() { return mActiveTrackExtensionHypothesesDevice; } TrackExtensionHypothesis* getDeviceNextTrackExtensionHypotheses() { return mNextTrackExtensionHypothesesDevice; } int* getDeviceNeighboursLUT(const int layer) { return mNeighboursLUTDevice[layer]; } @@ -226,6 +229,7 @@ class TimeFrameGPU : public TimeFrame float** mCellSeedsChi2DeviceArray; TrackITSExt* mTrackITSExtDevice; + int* mTrackIndicesDevice{nullptr}; TrackExtensionHypothesis* mActiveTrackExtensionHypothesesDevice{nullptr}; TrackExtensionHypothesis* mNextTrackExtensionHypothesesDevice{nullptr}; std::array mNeighboursDevice{}; @@ -244,6 +248,7 @@ class TimeFrameGPU : public TimeFrame // Temporary buffer for storing output tracks from GPU tracking bounded_vector mTrackITSExt; + bounded_vector mTrackIndices; }; template diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h index f4a89d9d24d8f..34ac3e564e26d 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h +++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h @@ -232,6 +232,7 @@ void computeTrackSeedHandler(TrackSeed* trackSeeds, const int** clustersIndexTables, const int** ROFClusters, o2::its::TrackITSExt* tracks, + int* trackIndices, const int* seedLUT, TrackExtensionHypothesis* activeHypotheses, TrackExtensionHypothesis* nextHypotheses, diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu index 614ff53786ebf..b023c91e5063c 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu @@ -578,8 +578,11 @@ void TimeFrameGPU::createTrackITSExtDevice(const size_t nSeeds) GPUChkErrS(cudaMemcpy(&mNTracks, mTrackSeedsLUTDevice + nSeeds, sizeof(int), cudaMemcpyDeviceToHost)); GPULog("gpu-allocation: reserving {} tracks, for {:.2f} MB.", mNTracks, mNTracks * sizeof(o2::its::TrackITSExt) / constants::MB); mTrackITSExt = bounded_vector(mNTracks, {}, this->getMemoryPool().get()); + mTrackIndices = bounded_vector(mNTracks, 0, this->getMemoryPool().get()); allocMem(reinterpret_cast(&mTrackITSExtDevice), mNTracks * sizeof(o2::its::TrackITSExt), this->hasFrameworkAllocator(), (o2::gpu::GPUMemoryResource::MEMORY_GPU | o2::gpu::GPUMemoryResource::MEMORY_STACK)); GPUChkErrS(cudaMemset(mTrackITSExtDevice, 0, mNTracks * sizeof(o2::its::TrackITSExt))); + GPULog("gpu-allocation: reserving {} track indices, for {:.2f} MB.", mNTracks, mNTracks * sizeof(int) / constants::MB); + allocMem(reinterpret_cast(&mTrackIndicesDevice), mNTracks * sizeof(int), this->hasFrameworkAllocator(), (o2::gpu::GPUMemoryResource::MEMORY_GPU | o2::gpu::GPUMemoryResource::MEMORY_STACK)); } template @@ -643,6 +646,14 @@ void TimeFrameGPU::downloadTrackITSExtDevice() GPUChkErrS(cudaMemcpy(mTrackITSExt.data(), mTrackITSExtDevice, mTrackITSExt.size() * sizeof(o2::its::TrackITSExt), cudaMemcpyDeviceToHost)); } +template +void TimeFrameGPU::downloadTrackIndicesDevice() +{ + GPUTimer timer("downloading track indices"); + GPULog("gpu-transfer: downloading {} track indices, for {:.2f} MB.", mTrackIndices.size(), mTrackIndices.size() * sizeof(int) / constants::MB); + GPUChkErrS(cudaMemcpy(mTrackIndices.data(), mTrackIndicesDevice, mTrackIndices.size() * sizeof(int), cudaMemcpyDeviceToHost)); +} + template void TimeFrameGPU::unregisterHostMemory(const int maxLayers) { diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx index 47d0ae0e34801..b4a00e357d0d4 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx @@ -380,6 +380,7 @@ void TrackerTraitsGPU::findRoads(const int iteration) mTimeFrameGPU->getDeviceArrayClustersIndexTables(), mTimeFrameGPU->getDeviceROFrameClusters(), mTimeFrameGPU->getDeviceTrackITSExt(), + mTimeFrameGPU->getDeviceTrackIndices(), mTimeFrameGPU->getDeviceTrackSeedsLUT(), extendTracks ? mTimeFrameGPU->getDeviceActiveTrackExtensionHypotheses() : nullptr, extendTracks ? mTimeFrameGPU->getDeviceNextTrackExtensionHypotheses() : nullptr, @@ -405,9 +406,11 @@ void TrackerTraitsGPU::findRoads(const int iteration) this->mTrkParams[iteration].CorrType, mTimeFrameGPU->getFrameworkAllocator()); mTimeFrameGPU->downloadTrackITSExtDevice(); + mTimeFrameGPU->downloadTrackIndicesDevice(); auto& tracks = mTimeFrameGPU->getTrackITSExt(); - this->acceptTracks(iteration, tracks, firstClusters); + const auto& trackIndices = mTimeFrameGPU->getTrackIndices(); + this->acceptTracks(iteration, tracks, trackIndices, firstClusters); mTimeFrameGPU->loadUsedClustersDevice(); } this->markTracks(iteration); diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu index 6f4f2bca76722..5d1757259d015 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -86,10 +87,12 @@ struct is_valid_pair { } }; -struct compare_track_chi2 { - GPUhd() bool operator()(const TrackITSExt& a, const TrackITSExt& b) const +struct compare_track_index_chi2 { + const TrackITSExt* tracks; + + GPUhd() bool operator()(const int a, const int b) const { - return o2::its::track::isBetter(a, b); + return o2::its::track::isBetter(tracks[a], tracks[b]); } }; @@ -1159,6 +1162,7 @@ void computeTrackSeedHandler(TrackSeed* trackSeeds, const int** clustersIndexTables, const int** ROFClusters, o2::its::TrackITSExt* tracks, + int* trackIndices, const int* seedLUT, TrackExtensionHypothesis* activeHypotheses, TrackExtensionHypothesis* nextHypotheses, @@ -1222,8 +1226,9 @@ void computeTrackSeedHandler(TrackSeed* trackSeeds, propagator, // const o2::base::Propagator* matCorrType); // o2::base::PropagatorF::MatCorrType auto sync_policy = THRUST_NAMESPACE::par(gpu::TypedAllocator(alloc)); - thrust::device_ptr tr_ptr(tracks); - thrust::sort(sync_policy, tr_ptr, tr_ptr + nTracks, gpu::compare_track_chi2()); + thrust::device_ptr trackIndicesPtr(trackIndices); + thrust::sequence(sync_policy, trackIndicesPtr, trackIndicesPtr + nTracks); + thrust::sort(sync_policy, trackIndicesPtr, trackIndicesPtr + nTracks, gpu::compare_track_index_chi2{tracks}); } /// Explicit instantiation of ITS2 handlers @@ -1401,6 +1406,7 @@ template void computeTrackSeedHandler(TrackSeed<7>* trackSeeds, const int** clustersIndexTables, const int** ROFClusters, o2::its::TrackITSExt* tracks, + int* trackIndices, const int* seedLUT, TrackExtensionHypothesis<7>* activeHypotheses, TrackExtensionHypothesis<7>* nextHypotheses, @@ -1602,6 +1608,7 @@ template void computeTrackSeedHandler(TrackSeed<11>* trackSeeds, const int** clustersIndexTables, const int** ROFClusters, o2::its::TrackITSExt* tracks, + int* trackIndices, const int* seedLUT, TrackExtensionHypothesis<11>* activeHypotheses, TrackExtensionHypothesis<11>* nextHypotheses, diff --git a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TrackerTraits.h b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TrackerTraits.h index aaad48731cc2a..4d6378aded0e8 100644 --- a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TrackerTraits.h +++ b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TrackerTraits.h @@ -59,7 +59,7 @@ class TrackerTraits template void processNeighbours(int iteration, int defaultCellTopologyId, int iLevel, const bounded_vector& currentCellSeed, const bounded_vector& currentCellId, const bounded_vector& currentCellTopologyId, bounded_vector& updatedCellSeed, bounded_vector& updatedCellId, bounded_vector& updatedCellTopologyId); - void acceptTracks(int iteration, bounded_vector& tracks, bounded_vector>& firstClusters); + void acceptTracks(int iteration, bounded_vector& tracks, const bounded_vector& trackIndices, bounded_vector>& firstClusters); void markTracks(int iteration); void updateTrackingParameters(const std::vector& trkPars) diff --git a/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx b/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx index b3043fcc3c010..d80a6d17c8135 100644 --- a/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx +++ b/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx @@ -846,11 +846,14 @@ void TrackerTraits::findRoads(const int iteration) deepVectorClear(trackSeeds); }); - std::sort(tracks.begin(), tracks.end(), [](const auto& a, const auto& b) { - return track::isBetter(a, b); + // Sort tracks via indices to avoid moving TrackITSExt objects. + bounded_vector trackIndices(tracks.size(), mMemoryPool.get()); + std::iota(trackIndices.begin(), trackIndices.end(), 0); + std::sort(trackIndices.begin(), trackIndices.end(), [&tracks](int a, int b) { + return track::isBetter(tracks[a], tracks[b]); }); - acceptTracks(iteration, tracks, firstClusters); + acceptTracks(iteration, tracks, trackIndices, firstClusters); } markTracks(iteration); } @@ -858,12 +861,14 @@ void TrackerTraits::findRoads(const int iteration) template void TrackerTraits::acceptTracks(int iteration, bounded_vector& tracks, + const bounded_vector& trackIndices, bounded_vector>& firstClusters) { auto& trks = mTimeFrame->getTracks(); trks.reserve(trks.size() + tracks.size()); const float smallestROFHalf = mTimeFrame->getROFOverlapTableView().getClockLayer().mROFLength * 0.5f; - for (auto& track : tracks) { + for (size_t trackId{0}; trackId < trackIndices.size(); ++trackId) { + auto& track = tracks[trackIndices[trackId]]; int nShared = 0; bool isFirstShared{false}; int firstLayer{-1}, firstCluster{-1};