/****************************************************************************** * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * @file * The cub::WarpExchange class provides [collective](../index.html#sec0) * methods for rearranging data partitioned across a CUDA warp. */ #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include #include #include #include CUB_NAMESPACE_BEGIN enum WarpExchangeAlgorithm { WARP_EXCHANGE_SMEM, WARP_EXCHANGE_SHUFFLE, }; namespace detail { template using InternalWarpExchangeImpl = ::cuda::std::_If, WarpExchangeShfl>; } // namespace detail /** * @brief The WarpExchange class provides [collective](../index.html#sec0) * methods for rearranging data partitioned across a CUDA warp. * * @tparam T * The data type to be exchanged. * * @tparam ITEMS_PER_THREAD * The number of items partitioned onto each thread. * * @tparam LOGICAL_WARP_THREADS * [optional] The number of threads per "logical" warp (may be less * than the number of hardware warp threads). Default is the warp size of the * targeted CUDA compute-capability (e.g., 32 threads for SM86). Must be a * power of two. * * @tparam LEGACY_PTX_ARCH * Unused. * * @par Overview * - It is commonplace for a warp of threads to rearrange data items between * threads. For example, the global memory accesses prefer patterns where * data items are "striped" across threads (where consecutive threads access * consecutive items), yet most warp-wide operations prefer a "blocked" * partitioning of items across threads (where consecutive items belong to a * single thread). * - WarpExchange supports the following types of data exchanges: * - Transposing between [blocked](../index.html#sec5sec3) and * [striped](../index.html#sec5sec3) arrangements * - Scattering ranked items to a * [striped arrangement](../index.html#sec5sec3) * * @par A Simple Example * @par * The code snippet below illustrates the conversion from a "blocked" to a * "striped" arrangement of 64 integer items partitioned across 16 threads where * each thread owns 4 items. * @par * @code * #include // or equivalently * * __global__ void ExampleKernel(int *d_data, ...) * { * constexpr int warp_threads = 16; * constexpr int block_threads = 256; * constexpr int items_per_thread = 4; * constexpr int warps_per_block = block_threads / warp_threads; * const int warp_id = static_cast(threadIdx.x) / warp_threads; * * // Specialize WarpExchange for a virtual warp of 16 threads owning 4 integer items each * using WarpExchangeT = * cub::WarpExchange; * * // Allocate shared memory for WarpExchange * __shared__ typename WarpExchangeT::TempStorage temp_storage[warps_per_block]; * * // Load a tile of data striped across threads * int thread_data[items_per_thread]; * // ... * * // Collectively exchange data into a blocked arrangement across threads * WarpExchangeT(temp_storage[warp_id]).StripedToBlocked(thread_data, thread_data); * @endcode * @par * Suppose the set of striped input @p thread_data across the block of threads * is { [0,16,32,48], [1,17,33,49], ..., [15, 32, 47, 63] }. * The corresponding output @p thread_data in those threads will be * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [60,61,62,63] }. */ template class WarpExchange : private detail::InternalWarpExchangeImpl { using InternalWarpExchange = detail::InternalWarpExchangeImpl; public: /// \smemstorage{WarpExchange} using TempStorage = typename InternalWarpExchange::TempStorage; //! @name Collective constructors //! @{ WarpExchange() = delete; /** * @brief Collective constructor using the specified memory allocation as * temporary storage. */ explicit _CCCL_DEVICE _CCCL_FORCEINLINE WarpExchange(TempStorage& temp_storage) : InternalWarpExchange(temp_storage) {} //! @} end member group //! @name Data movement //! @{ /** * @brief Transposes data items from blocked arrangement to * striped arrangement. * * @par * @smemwarpreuse * * @par Snippet * The code snippet below illustrates the conversion from a "blocked" to a * "striped" arrangement of 64 integer items partitioned across 16 threads * where each thread owns 4 items. * @par * @code * #include // or equivalently * * __global__ void ExampleKernel(int *d_data, ...) * { * constexpr int warp_threads = 16; * constexpr int block_threads = 256; * constexpr int items_per_thread = 4; * constexpr int warps_per_block = block_threads / warp_threads; * const int warp_id = static_cast(threadIdx.x) / warp_threads; * * // Specialize WarpExchange for a virtual warp of 16 threads owning 4 integer items each * using WarpExchangeT = cub::WarpExchange; * * // Allocate shared memory for WarpExchange * __shared__ typename WarpExchangeT::TempStorage temp_storage[warps_per_block]; * * // Obtain a segment of consecutive items that are blocked across threads * int thread_data[items_per_thread]; * // ... * * // Collectively exchange data into a striped arrangement across threads * WarpExchangeT(temp_storage[warp_id]).BlockedToStriped(thread_data, thread_data); * @endcode * @par * Suppose the set of striped input @p thread_data across the block of threads * is { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [60,61,62,63] }. * The corresponding output @p thread_data in those threads will be * { [0,16,32,48], [1,17,33,49], ..., [15, 32, 47, 63] }. * * @param[in] input_items * Items to exchange, converting between blocked and * striped arrangements. * * @param[out] output_items * Items from exchange, converting between striped and * blocked arrangements. May be aliased to @p input_items. */ template _CCCL_DEVICE _CCCL_FORCEINLINE void BlockedToStriped(const InputT (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD]) { InternalWarpExchange::BlockedToStriped(input_items, output_items); } /** * @brief Transposes data items from striped arrangement to * blocked arrangement. * * @par * @smemwarpreuse * * @par Snippet * The code snippet below illustrates the conversion from a "striped" to a * "blocked" arrangement of 64 integer items partitioned across 16 threads * where each thread owns 4 items. * @par * @code * #include // or equivalently * * __global__ void ExampleKernel(int *d_data, ...) * { * constexpr int warp_threads = 16; * constexpr int block_threads = 256; * constexpr int items_per_thread = 4; * constexpr int warps_per_block = block_threads / warp_threads; * const int warp_id = static_cast(threadIdx.x) / warp_threads; * * // Specialize WarpExchange for a virtual warp of 16 threads owning 4 integer items each * using WarpExchangeT = cub::WarpExchange; * * // Allocate shared memory for WarpExchange * __shared__ typename WarpExchangeT::TempStorage temp_storage[warps_per_block]; * * // Load a tile of data striped across threads * int thread_data[items_per_thread]; * // ... * * // Collectively exchange data into a blocked arrangement across threads * WarpExchangeT(temp_storage[warp_id]).StripedToBlocked(thread_data, thread_data); * @endcode * @par * Suppose the set of striped input @p thread_data across the block of threads * is { [0,16,32,48], [1,17,33,49], ..., [15, 32, 47, 63] }. * The corresponding output @p thread_data in those threads will be * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [60,61,62,63] }. * * @param[in] input_items * Items to exchange * * @param[out] output_items * Items from exchange. May be aliased to @p input_items. */ template _CCCL_DEVICE _CCCL_FORCEINLINE void StripedToBlocked(const InputT (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD]) { InternalWarpExchange::StripedToBlocked(input_items, output_items); } /** * @brief Exchanges valid data items annotated by rank * into striped arrangement. * * @par * @smemwarpreuse * * @par Snippet * The code snippet below illustrates the conversion from a "scatter" to a * "striped" arrangement of 64 integer items partitioned across 16 threads * where each thread owns 4 items. * @par * @code * #include // or equivalently * * __global__ void ExampleKernel(int *d_data, ...) * { * constexpr int warp_threads = 16; * constexpr int block_threads = 256; * constexpr int items_per_thread = 4; * constexpr int warps_per_block = block_threads / warp_threads; * const int warp_id = static_cast(threadIdx.x) / warp_threads; * * // Specialize WarpExchange for a virtual warp of 16 threads owning 4 integer items each * using WarpExchangeT = cub::WarpExchange; * * // Allocate shared memory for WarpExchange * __shared__ typename WarpExchangeT::TempStorage temp_storage[warps_per_block]; * * // Obtain a segment of consecutive items that are blocked across threads * int thread_data[items_per_thread]; * int thread_ranks[items_per_thread]; * // ... * * // Collectively exchange data into a striped arrangement across threads * WarpExchangeT(temp_storage[warp_id]).ScatterToStriped( * thread_data, thread_ranks); * @endcode * @par * Suppose the set of input @p thread_data across the block of threads * is `{ [0,1,2,3], [4,5,6,7], ..., [60,61,62,63] }`, and the set of * @p thread_ranks is `{ [63,62,61,60], ..., [7,6,5,4], [3,2,1,0] }`. The * corresponding output @p thread_data in those threads will be * `{ [63, 47, 31, 15], [62, 46, 30, 14], ..., [48, 32, 16, 0] }`. * * @tparam OffsetT [inferred] Signed integer type for local offsets * * @param[in,out] items Items to exchange * @param[in] ranks Corresponding scatter ranks */ template _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterToStriped(InputT (&items)[ITEMS_PER_THREAD], OffsetT (&ranks)[ITEMS_PER_THREAD]) { InternalWarpExchange::ScatterToStriped(items, ranks); } /** * @brief Exchanges valid data items annotated by rank * into striped arrangement. * * @par * @smemwarpreuse * * @par Snippet * The code snippet below illustrates the conversion from a "scatter" to a * "striped" arrangement of 64 integer items partitioned across 16 threads * where each thread owns 4 items. * @par * @code * #include // or equivalently * * __global__ void ExampleKernel(int *d_data, ...) * { * constexpr int warp_threads = 16; * constexpr int block_threads = 256; * constexpr int items_per_thread = 4; * constexpr int warps_per_block = block_threads / warp_threads; * const int warp_id = static_cast(threadIdx.x) / warp_threads; * * // Specialize WarpExchange for a virtual warp of 16 threads owning 4 integer items each * using WarpExchangeT = cub::WarpExchange; * * // Allocate shared memory for WarpExchange * __shared__ typename WarpExchangeT::TempStorage temp_storage[warps_per_block]; * * // Obtain a segment of consecutive items that are blocked across threads * int thread_input[items_per_thread]; * int thread_ranks[items_per_thread]; * // ... * * // Collectively exchange data into a striped arrangement across threads * int thread_output[items_per_thread]; * WarpExchangeT(temp_storage[warp_id]).ScatterToStriped( * thread_input, thread_output, thread_ranks); * @endcode * @par * Suppose the set of input @p thread_input across the block of threads * is `{ [0,1,2,3], [4,5,6,7], ..., [60,61,62,63] }`, and the set of * @p thread_ranks is `{ [63,62,61,60], ..., [7,6,5,4], [3,2,1,0] }`. The * corresponding @p thread_output in those threads will be * `{ [63, 47, 31, 15], [62, 46, 30, 14], ..., [48, 32, 16, 0] }`. * * @tparam OffsetT [inferred] Signed integer type for local offsets * * @param[in] input_items * Items to exchange * * @param[out] output_items * Items from exchange. May be aliased to @p input_items. * * @param[in] ranks * Corresponding scatter ranks */ template _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterToStriped( const InputT (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD], OffsetT (&ranks)[ITEMS_PER_THREAD]) { InternalWarpExchange::ScatterToStriped(input_items, output_items, ranks); } //@} end member group }; CUB_NAMESPACE_END