/****************************************************************************** * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ //! @file //! Operations for reading linear tiles of data into the CUDA warp. #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include #include #include #include CUB_NAMESPACE_BEGIN //! @rst //! ``cub::WarpLoadAlgorithm`` enumerates alternative algorithms for :cpp:struct:`cub::WarpLoad` to //! read a linear segment of data from memory into a CUDA warp. //! @endrst enum WarpLoadAlgorithm { //! @rst //! Overview //! ++++++++++++++++++++++++++ //! //! A :ref:`blocked arrangement ` of data is read directly from memory. //! //! Performance Considerations //! ++++++++++++++++++++++++++ //! //! The utilization of memory transactions (coalescing) decreases as the //! access stride between threads increases (i.e., the number items per thread). //! @endrst WARP_LOAD_DIRECT, //! @rst //! Overview //! ++++++++++++++++++++++++++ //! //! A :ref:`striped arrangement ` of data is read directly from memory. //! //! Performance Considerations //! ++++++++++++++++++++++++++ //! //! The utilization of memory transactions (coalescing) doesn't depend on //! the number of items per thread. //! @endrst WARP_LOAD_STRIPED, //! @rst //! Overview //! ++++++++++++++++++++++++++ //! //! A :ref:`blocked arrangement ` of data is read from memory using //! CUDA's built-in vectorized loads as a coalescing optimization. //! For example, ``ld.global.v4.s32`` instructions will be generated when ``T = int`` and //! ``ITEMS_PER_THREAD % 4 == 0``. //! //! Performance Considerations //! ++++++++++++++++++++++++++ //! //! - The utilization of memory transactions (coalescing) remains high until the the //! access stride between threads (i.e., the number items per thread) exceeds the //! maximum vector load width (typically 4 items or 64B, whichever is lower). //! - The following conditions will prevent vectorization and loading will fall //! back to cub::WARP_LOAD_DIRECT: //! //! - ``ITEMS_PER_THREAD`` is odd //! - The ``InputIteratorT`` is not a simple pointer type //! - The block input offset is not quadword-aligned //! - The data type ``T`` is not a built-in primitive or CUDA vector type //! (e.g., ``short``, ``int2``, ``double``, ``float2``, etc.) //! @endrst WARP_LOAD_VECTORIZE, //! @rst //! Overview //! ++++++++++++++++++++++++++ //! //! A :ref:`striped arrangement ` of data is read efficiently from //! memory and then locally transposed into a //! :ref:`blocked arrangement `. //! //! Performance Considerations //! ++++++++++++++++++++++++++ //! //! - The utilization of memory transactions (coalescing) remains high //! regardless of items loaded per thread. //! - The local reordering incurs slightly longer latencies and throughput than the direct //! ``cub::WARP_LOAD_DIRECT`` and ``cub::WARP_LOAD_VECTORIZE`` alternatives. //! @endrst WARP_LOAD_TRANSPOSE }; //! @rst //! The WarpLoad class provides :ref:`collective ` data movement methods for //! loading a linear segment of items from memory into a //! :ref:`blocked arrangement ` across a CUDA thread warp. //! //! Overview //! ++++++++++++++++ //! //! - The WarpLoad class provides a single data movement abstraction that can be //! specialized to implement different cub::WarpLoadAlgorithm strategies. This //! facilitates different performance policies for different architectures, data //! types, granularity sizes, etc. //! - WarpLoad can be optionally specialized by different data movement strategies: //! //! #. :cpp:enumerator:`cub::WARP_LOAD_DIRECT`: //! a :ref:`blocked arrangement ` of data is read directly from //! memory. //! #. :cpp:enumerator:`cub::WARP_LOAD_STRIPED`: //! a :ref:`striped arrangement ` of data is read directly from //! memory. //! #. :cpp:enumerator:`cub::WARP_LOAD_VECTORIZE`: //! a :ref:`blocked arrangement ` of data is read directly from //! memory using CUDA's built-in vectorized loads as a coalescing optimization. //! #. :cpp:enumerator:`cub::WARP_LOAD_TRANSPOSE`: //! a :ref:`striped arrangement ` of data is read directly from //! memory and is then locally transposed into a //! :ref:`blocked arrangement `. //! //! A Simple Example //! ++++++++++++++++ //! //! The code snippet below illustrates the loading of a linear segment of 64 //! integers into a "blocked" arrangement across 16 threads where each thread //! owns 4 consecutive items. The load is specialized for ``WARP_LOAD_TRANSPOSE``, //! meaning memory references are efficiently coalesced using a warp-striped access //! pattern (after which items are locally reordered among threads). //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! __global__ void ExampleKernel(int *d_data, ...) //! { //! constexpr int warp_threads = 16; //! constexpr int block_threads = 256; //! constexpr int items_per_thread = 4; //! //! // Specialize WarpLoad for a warp of 16 threads owning 4 integer items each //! using WarpLoadT = WarpLoad; //! //! constexpr int warps_in_block = block_threads / warp_threads; //! constexpr int tile_size = items_per_thread * warp_threads; //! const int warp_id = static_cast(threadIdx.x) / warp_threads; //! //! // Allocate shared memory for WarpLoad //! __shared__ typename WarpLoadT::TempStorage temp_storage[warps_in_block]; //! //! // Load a segment of consecutive items that are blocked across threads //! int thread_data[items_per_thread]; //! WarpLoadT(temp_storage[warp_id]).Load(d_data + warp_id * tile_size, //! thread_data); //! //! Suppose the input ``d_data`` is ``0, 1, 2, 3, 4, 5, ...``. //! The set of ``thread_data`` across the first logical warp of threads in those //! threads will be: ``{ [0,1,2,3], [4,5,6,7], ..., [60,61,62,63] }``. //! @endrst //! //! @tparam InputT //! The data type to read into (which must be convertible from the input //! iterator's value type). //! //! @tparam ITEMS_PER_THREAD //! The number of consecutive items partitioned onto each thread. //! //! @tparam ALGORITHM //! [optional] cub::WarpLoadAlgorithm tuning policy. //! default: cub::WARP_LOAD_DIRECT. //! //! @tparam LOGICAL_WARP_THREADS //! [optional] The number of threads per "logical" warp (may be less //! than the number of hardware warp threads). Default is the warp size of the //! targeted CUDA compute-capability (e.g., 32 threads for SM86). Must be a //! power of two. //! //! @tparam LEGACY_PTX_ARCH //! Unused. template class WarpLoad { static constexpr bool IS_ARCH_WARP = LOGICAL_WARP_THREADS == CUB_WARP_THREADS(0); static_assert(PowerOfTwo::VALUE, "LOGICAL_WARP_THREADS must be a power of two"); private: /***************************************************************************** * Algorithmic variants ****************************************************************************/ /// Load helper template struct LoadInternal; template struct LoadInternal { using TempStorage = NullType; int linear_tid; _CCCL_DEVICE _CCCL_FORCEINLINE LoadInternal(TempStorage& /*temp_storage*/, int linear_tid) : linear_tid(linear_tid) {} template _CCCL_DEVICE _CCCL_FORCEINLINE void Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD]) { LoadDirectBlocked(linear_tid, block_itr, items); } template _CCCL_DEVICE _CCCL_FORCEINLINE void Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items) { LoadDirectBlocked(linear_tid, block_itr, items, valid_items); } template _CCCL_DEVICE _CCCL_FORCEINLINE void Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default) { LoadDirectBlocked(linear_tid, block_itr, items, valid_items, oob_default); } }; template struct LoadInternal { using TempStorage = NullType; int linear_tid; _CCCL_DEVICE _CCCL_FORCEINLINE LoadInternal(TempStorage& /*temp_storage*/, int linear_tid) : linear_tid(linear_tid) {} template _CCCL_DEVICE _CCCL_FORCEINLINE void Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD]) { LoadDirectStriped(linear_tid, block_itr, items); } template _CCCL_DEVICE _CCCL_FORCEINLINE void Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items) { LoadDirectStriped(linear_tid, block_itr, items, valid_items); } template _CCCL_DEVICE _CCCL_FORCEINLINE void Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default) { LoadDirectStriped(linear_tid, block_itr, items, valid_items, oob_default); } }; template struct LoadInternal { using TempStorage = NullType; int linear_tid; _CCCL_DEVICE _CCCL_FORCEINLINE LoadInternal(TempStorage& /*temp_storage*/, int linear_tid) : linear_tid(linear_tid) {} template _CCCL_DEVICE _CCCL_FORCEINLINE void Load(InputT* block_ptr, InputT (&items)[ITEMS_PER_THREAD]) { InternalLoadDirectBlockedVectorized(linear_tid, block_ptr, items); } template _CCCL_DEVICE _CCCL_FORCEINLINE void Load(const InputT* block_ptr, InputT (&items)[ITEMS_PER_THREAD]) { InternalLoadDirectBlockedVectorized(linear_tid, block_ptr, items); } template _CCCL_DEVICE _CCCL_FORCEINLINE void Load(CacheModifiedInputIterator block_itr, InputT (&items)[ITEMS_PER_THREAD]) { InternalLoadDirectBlockedVectorized(linear_tid, block_itr.ptr, items); } template _CCCL_DEVICE _CCCL_FORCEINLINE void Load(_InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD]) { LoadDirectBlocked(linear_tid, block_itr, items); } template _CCCL_DEVICE _CCCL_FORCEINLINE void Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items) { LoadDirectBlocked(linear_tid, block_itr, items, valid_items); } template _CCCL_DEVICE _CCCL_FORCEINLINE void Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default) { LoadDirectBlocked(linear_tid, block_itr, items, valid_items, oob_default); } }; template struct LoadInternal { using WarpExchangeT = WarpExchange; struct _TempStorage : WarpExchangeT::TempStorage {}; struct TempStorage : Uninitialized<_TempStorage> {}; _TempStorage& temp_storage; int linear_tid; _CCCL_DEVICE _CCCL_FORCEINLINE LoadInternal(TempStorage& temp_storage, int linear_tid) : temp_storage(temp_storage.Alias()) , linear_tid(linear_tid) {} template _CCCL_DEVICE _CCCL_FORCEINLINE void Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD]) { LoadDirectStriped(linear_tid, block_itr, items); WarpExchangeT(temp_storage).StripedToBlocked(items, items); } template _CCCL_DEVICE _CCCL_FORCEINLINE void Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items) { LoadDirectStriped(linear_tid, block_itr, items, valid_items); WarpExchangeT(temp_storage).StripedToBlocked(items, items); } template _CCCL_DEVICE _CCCL_FORCEINLINE void Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default) { LoadDirectStriped(linear_tid, block_itr, items, valid_items, oob_default); WarpExchangeT(temp_storage).StripedToBlocked(items, items); } }; /***************************************************************************** * Type definitions ****************************************************************************/ /// Internal load implementation to use using InternalLoad = LoadInternal; /// Shared memory storage layout type using _TempStorage = typename InternalLoad::TempStorage; /***************************************************************************** * Utility methods ****************************************************************************/ /// Internal storage allocator _CCCL_DEVICE _CCCL_FORCEINLINE _TempStorage& PrivateStorage() { __shared__ _TempStorage private_storage; return private_storage; } /***************************************************************************** * Thread fields ****************************************************************************/ /// Thread reference to shared storage _TempStorage& temp_storage; /// Linear thread-id int linear_tid; public: /// @smemstorage{WarpLoad} struct TempStorage : Uninitialized<_TempStorage> {}; //! @name Collective constructors //! @{ //! @brief Collective constructor using a private static allocation of //! shared memory as temporary storage. _CCCL_DEVICE _CCCL_FORCEINLINE WarpLoad() : temp_storage(PrivateStorage()) , linear_tid(IS_ARCH_WARP ? LaneId() : (LaneId() % LOGICAL_WARP_THREADS)) {} //! @brief Collective constructor using the specified memory allocation as //! temporary storage. _CCCL_DEVICE _CCCL_FORCEINLINE WarpLoad(TempStorage& temp_storage) : temp_storage(temp_storage.Alias()) , linear_tid(IS_ARCH_WARP ? LaneId() : (LaneId() % LOGICAL_WARP_THREADS)) {} //! @} end member group //! @name Data movement //! @{ //! @rst //! Load a linear segment of items from memory. //! //! @smemwarpreuse //! //! Snippet //! +++++++ //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! __global__ void ExampleKernel(int *d_data, ...) //! { //! constexpr int warp_threads = 16; //! constexpr int block_threads = 256; //! constexpr int items_per_thread = 4; //! //! // Specialize WarpLoad for a warp of 16 threads owning 4 integer items each //! using WarpLoadT = WarpLoad; //! //! constexpr int warps_in_block = block_threads / warp_threads; //! constexpr int tile_size = items_per_thread * warp_threads; //! const int warp_id = static_cast(threadIdx.x) / warp_threads; //! //! // Allocate shared memory for WarpLoad //! __shared__ typename WarpLoadT::TempStorage temp_storage[warps_in_block]; //! //! // Load a segment of consecutive items that are blocked across threads //! int thread_data[items_per_thread]; //! WarpLoadT(temp_storage[warp_id]).Load(d_data + warp_id * tile_size, //! thread_data); //! //! Suppose the input ``d_data`` is ``0, 1, 2, 3, 4, 5, ...``, //! The set of ``thread_data`` across the first logical warp of threads in those //! threads will be: ``{ [0,1,2,3], [4,5,6,7], ..., [60,61,62,63] }``. //! @endrst //! //! @param[in] block_itr The thread block's base input iterator for loading from //! @param[out] items Data to load template _CCCL_DEVICE _CCCL_FORCEINLINE void Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD]) { InternalLoad(temp_storage, linear_tid).Load(block_itr, items); } //! @rst //! Load a linear segment of items from memory, guarded by range. //! //! @smemwarpreuse //! //! Snippet //! +++++++ //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! __global__ void ExampleKernel(int *d_data, int valid_items, ...) //! { //! constexpr int warp_threads = 16; //! constexpr int block_threads = 256; //! constexpr int items_per_thread = 4; //! //! // Specialize WarpLoad for a warp of 16 threads owning 4 integer items each //! using WarpLoadT = WarpLoad; //! //! constexpr int warps_in_block = block_threads / warp_threads; //! constexpr int tile_size = items_per_thread * warp_threads; //! const int warp_id = static_cast(threadIdx.x) / warp_threads; //! //! // Allocate shared memory for WarpLoad //! __shared__ typename WarpLoadT::TempStorage temp_storage[warps_in_block]; //! //! // Load a segment of consecutive items that are blocked across threads //! int thread_data[items_per_thread]; //! WarpLoadT(temp_storage[warp_id]).Load(d_data + warp_id * tile_size, //! thread_data, //! valid_items); //! //! Suppose the input ``d_data`` is ``0, 1, 2, 3, 4, 5, ...`` and ``valid_items`` is ``5``. //! The set of ``thread_data`` across the first logical warp of threads in those threads will be: //! ``{ [0,1,2,3], [4,?,?,?], ..., [?,?,?,?] }`` with only the first two threads being unmasked to //! load portions of valid data (and other items remaining unassigned). //! @endrst //! //! @param[in] block_itr The thread block's base input iterator for loading from //! @param[out] items Data to load //! @param[in] valid_items Number of valid items to load template _CCCL_DEVICE _CCCL_FORCEINLINE void Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items) { InternalLoad(temp_storage, linear_tid).Load(block_itr, items, valid_items); } //! @rst //! Load a linear segment of items from memory, guarded by range. //! //! @smemwarpreuse //! //! Snippet //! +++++++ //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! __global__ void ExampleKernel(int *d_data, int valid_items, ...) //! { //! constexpr int warp_threads = 16; //! constexpr int block_threads = 256; //! constexpr int items_per_thread = 4; //! //! // Specialize WarpLoad for a warp of 16 threads owning 4 integer items each //! using WarpLoadT = WarpLoad; //! //! constexpr int warps_in_block = block_threads / warp_threads; //! constexpr int tile_size = items_per_thread * warp_threads; //! const int warp_id = static_cast(threadIdx.x) / warp_threads; //! //! // Allocate shared memory for WarpLoad //! __shared__ typename WarpLoadT::TempStorage temp_storage[warps_in_block]; //! //! // Load a segment of consecutive items that are blocked across threads //! int thread_data[items_per_thread]; //! WarpLoadT(temp_storage[warp_id]).Load(d_data + warp_id * tile_size, //! thread_data, //! valid_items, //! -1); //! //! Suppose the input ``d_data`` is ``0, 1, 2, 3, 4, 5, ...``, ``valid_items`` is ``5``, and the //! out-of-bounds default is ``-1``. The set of ``thread_data`` across the first logical warp of //! threads in those threads will be: ``{ [0,1,2,3], [4,-1,-1,-1], ..., [-1,-1,-1,-1] }`` with //! only the first two threads being unmasked to load portions of valid data (and other items //! are assigned ``-1``). //! @endrst //! //! @param[in] block_itr The thread block's base input iterator for loading from //! @param[out] items Data to load //! @param[in] valid_items Number of valid items to load //! @param[in] oob_default Default value to assign out-of-bound items template _CCCL_DEVICE _CCCL_FORCEINLINE void Load(InputIteratorT block_itr, InputT (&items)[ITEMS_PER_THREAD], int valid_items, DefaultT oob_default) { InternalLoad(temp_storage, linear_tid).Load(block_itr, items, valid_items, oob_default); } //! @} end member group }; CUB_NAMESPACE_END