/******************************************************************************
* Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/**
* @file
* The cub::WarpExchange class provides [collective](../index.html#sec0)
* methods for rearranging data partitioned across a CUDA warp.
*/
#pragma once
#include
#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
# pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
# pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
# pragma system_header
#endif // no system header
#include
#include
#include
#include
#include
CUB_NAMESPACE_BEGIN
enum WarpExchangeAlgorithm
{
WARP_EXCHANGE_SMEM,
WARP_EXCHANGE_SHUFFLE,
};
namespace detail
{
template
using InternalWarpExchangeImpl =
::cuda::std::_If,
WarpExchangeShfl>;
} // namespace detail
/**
* @brief The WarpExchange class provides [collective](../index.html#sec0)
* methods for rearranging data partitioned across a CUDA warp.
*
* @tparam T
* The data type to be exchanged.
*
* @tparam ITEMS_PER_THREAD
* The number of items partitioned onto each thread.
*
* @tparam LOGICAL_WARP_THREADS
* [optional] The number of threads per "logical" warp (may be less
* than the number of hardware warp threads). Default is the warp size of the
* targeted CUDA compute-capability (e.g., 32 threads for SM86). Must be a
* power of two.
*
* @tparam LEGACY_PTX_ARCH
* Unused.
*
* @par Overview
* - It is commonplace for a warp of threads to rearrange data items between
* threads. For example, the global memory accesses prefer patterns where
* data items are "striped" across threads (where consecutive threads access
* consecutive items), yet most warp-wide operations prefer a "blocked"
* partitioning of items across threads (where consecutive items belong to a
* single thread).
* - WarpExchange supports the following types of data exchanges:
* - Transposing between [blocked](../index.html#sec5sec3) and
* [striped](../index.html#sec5sec3) arrangements
* - Scattering ranked items to a
* [striped arrangement](../index.html#sec5sec3)
*
* @par A Simple Example
* @par
* The code snippet below illustrates the conversion from a "blocked" to a
* "striped" arrangement of 64 integer items partitioned across 16 threads where
* each thread owns 4 items.
* @par
* @code
* #include // or equivalently
*
* __global__ void ExampleKernel(int *d_data, ...)
* {
* constexpr int warp_threads = 16;
* constexpr int block_threads = 256;
* constexpr int items_per_thread = 4;
* constexpr int warps_per_block = block_threads / warp_threads;
* const int warp_id = static_cast(threadIdx.x) / warp_threads;
*
* // Specialize WarpExchange for a virtual warp of 16 threads owning 4 integer items each
* using WarpExchangeT =
* cub::WarpExchange;
*
* // Allocate shared memory for WarpExchange
* __shared__ typename WarpExchangeT::TempStorage temp_storage[warps_per_block];
*
* // Load a tile of data striped across threads
* int thread_data[items_per_thread];
* // ...
*
* // Collectively exchange data into a blocked arrangement across threads
* WarpExchangeT(temp_storage[warp_id]).StripedToBlocked(thread_data, thread_data);
* @endcode
* @par
* Suppose the set of striped input @p thread_data across the block of threads
* is { [0,16,32,48], [1,17,33,49], ..., [15, 32, 47, 63] }.
* The corresponding output @p thread_data in those threads will be
* { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [60,61,62,63] }.
*/
template
class WarpExchange
: private detail::InternalWarpExchangeImpl
{
using InternalWarpExchange =
detail::InternalWarpExchangeImpl;
public:
/// \smemstorage{WarpExchange}
using TempStorage = typename InternalWarpExchange::TempStorage;
//! @name Collective constructors
//! @{
WarpExchange() = delete;
/**
* @brief Collective constructor using the specified memory allocation as
* temporary storage.
*/
explicit _CCCL_DEVICE _CCCL_FORCEINLINE WarpExchange(TempStorage& temp_storage)
: InternalWarpExchange(temp_storage)
{}
//! @} end member group
//! @name Data movement
//! @{
/**
* @brief Transposes data items from blocked arrangement to
* striped arrangement.
*
* @par
* @smemwarpreuse
*
* @par Snippet
* The code snippet below illustrates the conversion from a "blocked" to a
* "striped" arrangement of 64 integer items partitioned across 16 threads
* where each thread owns 4 items.
* @par
* @code
* #include // or equivalently
*
* __global__ void ExampleKernel(int *d_data, ...)
* {
* constexpr int warp_threads = 16;
* constexpr int block_threads = 256;
* constexpr int items_per_thread = 4;
* constexpr int warps_per_block = block_threads / warp_threads;
* const int warp_id = static_cast(threadIdx.x) / warp_threads;
*
* // Specialize WarpExchange for a virtual warp of 16 threads owning 4 integer items each
* using WarpExchangeT = cub::WarpExchange;
*
* // Allocate shared memory for WarpExchange
* __shared__ typename WarpExchangeT::TempStorage temp_storage[warps_per_block];
*
* // Obtain a segment of consecutive items that are blocked across threads
* int thread_data[items_per_thread];
* // ...
*
* // Collectively exchange data into a striped arrangement across threads
* WarpExchangeT(temp_storage[warp_id]).BlockedToStriped(thread_data, thread_data);
* @endcode
* @par
* Suppose the set of striped input @p thread_data across the block of threads
* is { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [60,61,62,63] }.
* The corresponding output @p thread_data in those threads will be
* { [0,16,32,48], [1,17,33,49], ..., [15, 32, 47, 63] }.
*
* @param[in] input_items
* Items to exchange, converting between blocked and
* striped arrangements.
*
* @param[out] output_items
* Items from exchange, converting between striped and
* blocked arrangements. May be aliased to @p input_items.
*/
template
_CCCL_DEVICE _CCCL_FORCEINLINE void
BlockedToStriped(const InputT (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD])
{
InternalWarpExchange::BlockedToStriped(input_items, output_items);
}
/**
* @brief Transposes data items from striped arrangement to
* blocked arrangement.
*
* @par
* @smemwarpreuse
*
* @par Snippet
* The code snippet below illustrates the conversion from a "striped" to a
* "blocked" arrangement of 64 integer items partitioned across 16 threads
* where each thread owns 4 items.
* @par
* @code
* #include // or equivalently
*
* __global__ void ExampleKernel(int *d_data, ...)
* {
* constexpr int warp_threads = 16;
* constexpr int block_threads = 256;
* constexpr int items_per_thread = 4;
* constexpr int warps_per_block = block_threads / warp_threads;
* const int warp_id = static_cast(threadIdx.x) / warp_threads;
*
* // Specialize WarpExchange for a virtual warp of 16 threads owning 4 integer items each
* using WarpExchangeT = cub::WarpExchange;
*
* // Allocate shared memory for WarpExchange
* __shared__ typename WarpExchangeT::TempStorage temp_storage[warps_per_block];
*
* // Load a tile of data striped across threads
* int thread_data[items_per_thread];
* // ...
*
* // Collectively exchange data into a blocked arrangement across threads
* WarpExchangeT(temp_storage[warp_id]).StripedToBlocked(thread_data, thread_data);
* @endcode
* @par
* Suppose the set of striped input @p thread_data across the block of threads
* is { [0,16,32,48], [1,17,33,49], ..., [15, 32, 47, 63] }.
* The corresponding output @p thread_data in those threads will be
* { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [60,61,62,63] }.
*
* @param[in] input_items
* Items to exchange
*
* @param[out] output_items
* Items from exchange. May be aliased to @p input_items.
*/
template
_CCCL_DEVICE _CCCL_FORCEINLINE void
StripedToBlocked(const InputT (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD])
{
InternalWarpExchange::StripedToBlocked(input_items, output_items);
}
/**
* @brief Exchanges valid data items annotated by rank
* into striped arrangement.
*
* @par
* @smemwarpreuse
*
* @par Snippet
* The code snippet below illustrates the conversion from a "scatter" to a
* "striped" arrangement of 64 integer items partitioned across 16 threads
* where each thread owns 4 items.
* @par
* @code
* #include // or equivalently
*
* __global__ void ExampleKernel(int *d_data, ...)
* {
* constexpr int warp_threads = 16;
* constexpr int block_threads = 256;
* constexpr int items_per_thread = 4;
* constexpr int warps_per_block = block_threads / warp_threads;
* const int warp_id = static_cast(threadIdx.x) / warp_threads;
*
* // Specialize WarpExchange for a virtual warp of 16 threads owning 4 integer items each
* using WarpExchangeT = cub::WarpExchange;
*
* // Allocate shared memory for WarpExchange
* __shared__ typename WarpExchangeT::TempStorage temp_storage[warps_per_block];
*
* // Obtain a segment of consecutive items that are blocked across threads
* int thread_data[items_per_thread];
* int thread_ranks[items_per_thread];
* // ...
*
* // Collectively exchange data into a striped arrangement across threads
* WarpExchangeT(temp_storage[warp_id]).ScatterToStriped(
* thread_data, thread_ranks);
* @endcode
* @par
* Suppose the set of input @p thread_data across the block of threads
* is `{ [0,1,2,3], [4,5,6,7], ..., [60,61,62,63] }`, and the set of
* @p thread_ranks is `{ [63,62,61,60], ..., [7,6,5,4], [3,2,1,0] }`. The
* corresponding output @p thread_data in those threads will be
* `{ [63, 47, 31, 15], [62, 46, 30, 14], ..., [48, 32, 16, 0] }`.
*
* @tparam OffsetT [inferred] Signed integer type for local offsets
*
* @param[in,out] items Items to exchange
* @param[in] ranks Corresponding scatter ranks
*/
template
_CCCL_DEVICE _CCCL_FORCEINLINE void
ScatterToStriped(InputT (&items)[ITEMS_PER_THREAD], OffsetT (&ranks)[ITEMS_PER_THREAD])
{
InternalWarpExchange::ScatterToStriped(items, ranks);
}
/**
* @brief Exchanges valid data items annotated by rank
* into striped arrangement.
*
* @par
* @smemwarpreuse
*
* @par Snippet
* The code snippet below illustrates the conversion from a "scatter" to a
* "striped" arrangement of 64 integer items partitioned across 16 threads
* where each thread owns 4 items.
* @par
* @code
* #include // or equivalently
*
* __global__ void ExampleKernel(int *d_data, ...)
* {
* constexpr int warp_threads = 16;
* constexpr int block_threads = 256;
* constexpr int items_per_thread = 4;
* constexpr int warps_per_block = block_threads / warp_threads;
* const int warp_id = static_cast(threadIdx.x) / warp_threads;
*
* // Specialize WarpExchange for a virtual warp of 16 threads owning 4 integer items each
* using WarpExchangeT = cub::WarpExchange;
*
* // Allocate shared memory for WarpExchange
* __shared__ typename WarpExchangeT::TempStorage temp_storage[warps_per_block];
*
* // Obtain a segment of consecutive items that are blocked across threads
* int thread_input[items_per_thread];
* int thread_ranks[items_per_thread];
* // ...
*
* // Collectively exchange data into a striped arrangement across threads
* int thread_output[items_per_thread];
* WarpExchangeT(temp_storage[warp_id]).ScatterToStriped(
* thread_input, thread_output, thread_ranks);
* @endcode
* @par
* Suppose the set of input @p thread_input across the block of threads
* is `{ [0,1,2,3], [4,5,6,7], ..., [60,61,62,63] }`, and the set of
* @p thread_ranks is `{ [63,62,61,60], ..., [7,6,5,4], [3,2,1,0] }`. The
* corresponding @p thread_output in those threads will be
* `{ [63, 47, 31, 15], [62, 46, 30, 14], ..., [48, 32, 16, 0] }`.
*
* @tparam OffsetT [inferred] Signed integer type for local offsets
*
* @param[in] input_items
* Items to exchange
*
* @param[out] output_items
* Items from exchange. May be aliased to @p input_items.
*
* @param[in] ranks
* Corresponding scatter ranks
*/
template
_CCCL_DEVICE _CCCL_FORCEINLINE void ScatterToStriped(
const InputT (&input_items)[ITEMS_PER_THREAD],
OutputT (&output_items)[ITEMS_PER_THREAD],
OffsetT (&ranks)[ITEMS_PER_THREAD])
{
InternalWarpExchange::ScatterToStriped(input_items, output_items, ranks);
}
//@} end member group
};
CUB_NAMESPACE_END