/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ //! @file //! @rst //! The ``cub::WarpReduce`` class provides :ref:`collective ` methods for //! computing a parallel reduction of items partitioned across a CUDA thread warp. //! @endrst #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include #include #include #include CUB_NAMESPACE_BEGIN //! @rst //! The ``WarpReduce`` class provides :ref:`collective ` methods for //! computing a parallel reduction of items partitioned across a CUDA thread warp. //! //! .. image:: ../../img/warp_reduce_logo.png //! :align: center //! //! Overview //! ++++++++++++++++++++++++++ //! //! - A `reduction `__ (or *fold*) //! uses a binary combining operator to compute a single aggregate from a list of input elements. //! - Supports "logical" warps smaller than the physical warp size (e.g., logical warps of 8 //! threads) //! - The number of entrant threads must be an multiple of ``LOGICAL_WARP_THREADS`` //! //! Performance Considerations //! ++++++++++++++++++++++++++ //! //! - Uses special instructions when applicable (e.g., warp ``SHFL`` instructions) //! - Uses synchronization-free communication between warp lanes when applicable //! - Incurs zero bank conflicts for most types //! - Computation is slightly more efficient (i.e., having lower instruction overhead) for: //! //! - Summation (**vs.** generic reduction) //! - The architecture's warp size is a whole multiple of ``LOGICAL_WARP_THREADS`` //! //! Simple Examples //! ++++++++++++++++++++++++++ //! //! @warpcollective{WarpReduce} //! //! The code snippet below illustrates four concurrent warp sum reductions within a block of //! 128 threads (one per each of the 32-thread warps). //! //! .. code-block:: c++ //! //! #include //! //! __global__ void ExampleKernel(...) //! { //! // Specialize WarpReduce for type int //! using WarpReduce = cub::WarpReduce; //! //! // Allocate WarpReduce shared memory for 4 warps //! __shared__ typename WarpReduce::TempStorage temp_storage[4]; //! //! // Obtain one input item per thread //! int thread_data = ... //! //! // Return the warp-wide sums to each lane0 (threads 0, 32, 64, and 96) //! int warp_id = threadIdx.x / 32; //! int aggregate = WarpReduce(temp_storage[warp_id]).Sum(thread_data); //! //! Suppose the set of input ``thread_data`` across the block of threads is //! ``{0, 1, 2, 3, ..., 127}``. The corresponding output ``aggregate`` in threads 0, 32, 64, and 96 //! will be ``496``, ``1520``, ``2544``, and ``3568``, respectively //! (and is undefined in other threads). //! //! The code snippet below illustrates a single warp sum reduction within a block of //! 128 threads. //! //! .. code-block:: c++ //! //! #include //! //! __global__ void ExampleKernel(...) //! { //! // Specialize WarpReduce for type int //! using WarpReduce = cub::WarpReduce; //! //! // Allocate WarpReduce shared memory for one warp //! __shared__ typename WarpReduce::TempStorage temp_storage; //! ... //! //! // Only the first warp performs a reduction //! if (threadIdx.x < 32) //! { //! // Obtain one input item per thread //! int thread_data = ... //! //! // Return the warp-wide sum to lane0 //! int aggregate = WarpReduce(temp_storage).Sum(thread_data); //! //! Suppose the set of input ``thread_data`` across the warp of threads is //! ``{0, 1, 2, 3, ..., 31}``. The corresponding output ``aggregate`` in thread0 will be ``496`` //! (and is undefined in other threads). //! @endrst //! //! @tparam T //! The reduction input/output element type //! //! @tparam LOGICAL_WARP_THREADS //! [optional] The number of threads per "logical" warp (may be less than the number of //! hardware warp threads). Default is the warp size of the targeted CUDA compute-capability //! (e.g., 32 threads for SM20). //! //! @tparam LEGACY_PTX_ARCH //! [optional] Unused. template class WarpReduce { private: /****************************************************************************** * Constants and type definitions ******************************************************************************/ enum { /// Whether the logical warp size and the PTX warp size coincide IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(0)), /// Whether the logical warp size is a power-of-two IS_POW_OF_TWO = PowerOfTwo::VALUE, }; public: #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document /// Internal specialization. /// Use SHFL-based reduction if LOGICAL_WARP_THREADS is a power-of-two using InternalWarpReduce = ::cuda::std::_If, WarpReduceSmem>; #endif // DOXYGEN_SHOULD_SKIP_THIS private: /// Shared memory storage layout type for WarpReduce using _TempStorage = typename InternalWarpReduce::TempStorage; /****************************************************************************** * Thread fields ******************************************************************************/ /// Shared storage reference _TempStorage& temp_storage; /****************************************************************************** * Utility methods ******************************************************************************/ public: /// \smemstorage{WarpReduce} struct TempStorage : Uninitialized<_TempStorage> {}; //! @name Collective constructors //! @{ //! @rst //! Collective constructor using the specified memory allocation as temporary storage. //! Logical warp and lane identifiers are constructed from ``threadIdx.x``. //! @endrst //! //! @param[in] temp_storage Reference to memory allocation having layout type TempStorage _CCCL_DEVICE _CCCL_FORCEINLINE WarpReduce(TempStorage& temp_storage) : temp_storage(temp_storage.Alias()) {} //! @} end member group //! @name Summation reductions //! @{ //! @rst //! Computes a warp-wide sum in the calling warp. //! The output is valid in warp *lane*\ :sub:`0`. //! //! @smemwarpreuse //! //! Snippet //! +++++++ //! //! The code snippet below illustrates four concurrent warp sum reductions within a block of //! 128 threads (one per each of the 32-thread warps). //! //! .. code-block:: c++ //! //! #include //! //! __global__ void ExampleKernel(...) //! { //! // Specialize WarpReduce for type int //! using WarpReduce = cub::WarpReduce; //! //! // Allocate WarpReduce shared memory for 4 warps //! __shared__ typename WarpReduce::TempStorage temp_storage[4]; //! //! // Obtain one input item per thread //! int thread_data = ... //! //! // Return the warp-wide sums to each lane0 //! int warp_id = threadIdx.x / 32; //! int aggregate = WarpReduce(temp_storage[warp_id]).Sum(thread_data); //! //! Suppose the set of input ``thread_data`` across the block of threads is //! ``{0, 1, 2, 3, ..., 127}``. //! The corresponding output ``aggregate`` in threads 0, 32, 64, and 96 will ``496``, ``1520``, //! ``2544``, and ``3568``, respectively (and is undefined in other threads). //! @endrst //! //! @param[in] input Calling thread's input _CCCL_DEVICE _CCCL_FORCEINLINE T Sum(T input) { return InternalWarpReduce(temp_storage).template Reduce(input, LOGICAL_WARP_THREADS, cub::Sum()); } //! @rst //! Computes a partially-full warp-wide sum in the calling warp. //! The output is valid in warp *lane*\ :sub:`0`. //! //! All threads across the calling warp must agree on the same value for ``valid_items``. //! Otherwise the result is undefined. //! //! @smemwarpreuse //! //! Snippet //! +++++++ //! //! The code snippet below illustrates a sum reduction within a single, partially-full //! block of 32 threads (one warp). //! //! .. code-block:: c++ //! //! #include //! //! __global__ void ExampleKernel(int *d_data, int valid_items) //! { //! // Specialize WarpReduce for type int //! using WarpReduce = cub::WarpReduce; //! //! // Allocate WarpReduce shared memory for one warp //! __shared__ typename WarpReduce::TempStorage temp_storage; //! //! // Obtain one input item per thread if in range //! int thread_data; //! if (threadIdx.x < valid_items) //! thread_data = d_data[threadIdx.x]; //! //! // Return the warp-wide sums to each lane0 //! int aggregate = WarpReduce(temp_storage).Sum( //! thread_data, valid_items); //! //! Suppose the input ``d_data`` is ``{0, 1, 2, 3, 4, ...`` and ``valid_items`` is ``4``. //! The corresponding output ``aggregate`` in *lane*\ :sub:`0` is ``6`` //! (and is undefined in other threads). //! @endrst //! //! @param[in] input //! Calling thread's input //! //! @param[in] valid_items //! Total number of valid items in the calling thread's logical warp //! (may be less than ``LOGICAL_WARP_THREADS``) _CCCL_DEVICE _CCCL_FORCEINLINE T Sum(T input, int valid_items) { // Determine if we don't need bounds checking return InternalWarpReduce(temp_storage).template Reduce(input, valid_items, cub::Sum()); } //! @rst //! Computes a segmented sum in the calling warp where segments are defined by head-flags. //! The sum of each segment is returned to the first lane in that segment //! (which always includes *lane*\ :sub:`0`). //! //! @smemwarpreuse //! //! Snippet //! +++++++ //! //! The code snippet below illustrates a head-segmented warp sum //! reduction within a block of 32 threads (one warp). //! //! .. code-block:: c++ //! //! #include //! //! __global__ void ExampleKernel(...) //! { //! // Specialize WarpReduce for type int //! using WarpReduce = cub::WarpReduce; //! //! // Allocate WarpReduce shared memory for one warp //! __shared__ typename WarpReduce::TempStorage temp_storage; //! //! // Obtain one input item and flag per thread //! int thread_data = ... //! int head_flag = ... //! //! // Return the warp-wide sums to each lane0 //! int aggregate = WarpReduce(temp_storage).HeadSegmentedSum( //! thread_data, head_flag); //! //! Suppose the set of input ``thread_data`` and ``head_flag`` across the block of threads //! is ``{0, 1, 2, 3, ..., 31`` and is ``{1, 0, 0, 0, 1, 0, 0, 0, ..., 1, 0, 0, 0``, //! respectively. The corresponding output ``aggregate`` in threads 0, 4, 8, etc. will be //! ``6``, ``22``, ``38``, etc. (and is undefined in other threads). //! @endrst //! //! @tparam ReductionOp //! **[inferred]** Binary reduction operator type having member //! `T operator()(const T &a, const T &b)` //! //! @param[in] input //! Calling thread's input //! //! @param[in] head_flag //! Head flag denoting whether or not `input` is the start of a new segment template _CCCL_DEVICE _CCCL_FORCEINLINE T HeadSegmentedSum(T input, FlagT head_flag) { return HeadSegmentedReduce(input, head_flag, cub::Sum()); } //! @rst //! Computes a segmented sum in the calling warp where segments are defined by tail-flags. //! The sum of each segment is returned to the first lane in that segment //! (which always includes *lane*\ :sub:`0`). //! //! @smemwarpreuse //! //! Snippet //! +++++++ //! //! The code snippet below illustrates a tail-segmented warp sum reduction within a block of 32 //! threads (one warp). //! //! .. code-block:: c++ //! //! #include //! //! __global__ void ExampleKernel(...) //! { //! // Specialize WarpReduce for type int //! using WarpReduce = cub::WarpReduce; //! //! // Allocate WarpReduce shared memory for one warp //! __shared__ typename WarpReduce::TempStorage temp_storage; //! //! // Obtain one input item and flag per thread //! int thread_data = ... //! int tail_flag = ... //! //! // Return the warp-wide sums to each lane0 //! int aggregate = WarpReduce(temp_storage).TailSegmentedSum( //! thread_data, tail_flag); //! //! Suppose the set of input ``thread_data`` and ``tail_flag`` across the block of threads //! is ``{0, 1, 2, 3, ..., 31}`` and is ``{0, 0, 0, 1, 0, 0, 0, 1, ..., 0, 0, 0, 1}``, //! respectively. The corresponding output ``aggregate`` in threads 0, 4, 8, etc. will be //! ``6``, ``22``, ``38``, etc. (and is undefined in other threads). //! @endrst //! //! @tparam ReductionOp //! **[inferred]** Binary reduction operator type having member //! `T operator()(const T &a, const T &b)` //! //! @param[in] input //! Calling thread's input //! //! @param[in] tail_flag //! Head flag denoting whether or not `input` is the start of a new segment template _CCCL_DEVICE _CCCL_FORCEINLINE T TailSegmentedSum(T input, FlagT tail_flag) { return TailSegmentedReduce(input, tail_flag, cub::Sum()); } //! @} end member group //! @name Generic reductions //! @{ //! @rst //! Computes a warp-wide reduction in the calling warp using the specified binary reduction //! functor. The output is valid in warp *lane*\ :sub:`0`. //! //! Supports non-commutative reduction operators //! //! @smemwarpreuse //! //! Snippet //! +++++++ //! //! The code snippet below illustrates four concurrent warp max reductions within a block of //! 128 threads (one per each of the 32-thread warps). //! //! .. code-block:: c++ //! //! #include //! //! __global__ void ExampleKernel(...) //! { //! // Specialize WarpReduce for type int //! using WarpReduce = cub::WarpReduce; //! //! // Allocate WarpReduce shared memory for 4 warps //! __shared__ typename WarpReduce::TempStorage temp_storage[4]; //! //! // Obtain one input item per thread //! int thread_data = ... //! //! // Return the warp-wide reductions to each lane0 //! int warp_id = threadIdx.x / 32; //! int aggregate = WarpReduce(temp_storage[warp_id]).Reduce( //! thread_data, cub::Max()); //! //! Suppose the set of input ``thread_data`` across the block of threads is //! ``{0, 1, 2, 3, ..., 127}``. The corresponding output ``aggregate`` in threads 0, 32, 64, and //! 96 will be ``31``, ``63``, ``95``, and ``127``, respectively //! (and is undefined in other threads). //! @endrst //! //! @tparam ReductionOp //! **[inferred]** Binary reduction operator type having member //! `T operator()(const T &a, const T &b)` //! //! @param[in] input //! Calling thread's input //! //! @param[in] reduction_op //! Binary reduction operator template _CCCL_DEVICE _CCCL_FORCEINLINE T Reduce(T input, ReductionOp reduction_op) { return InternalWarpReduce(temp_storage).template Reduce(input, LOGICAL_WARP_THREADS, reduction_op); } //! @rst //! Computes a partially-full warp-wide reduction in the calling warp using the specified binary //! reduction functor. The output is valid in warp *lane*\ :sub:`0`. //! //! All threads across the calling warp must agree on the same value for ``valid_items``. //! Otherwise the result is undefined. //! //! Supports non-commutative reduction operators //! //! @smemwarpreuse //! //! Snippet //! +++++++ //! //! The code snippet below illustrates a max reduction within a single, partially-full //! block of 32 threads (one warp). //! //! .. code-block:: c++ //! //! #include //! //! __global__ void ExampleKernel(int *d_data, int valid_items) //! { //! // Specialize WarpReduce for type int //! using WarpReduce = cub::WarpReduce; //! //! // Allocate WarpReduce shared memory for one warp //! __shared__ typename WarpReduce::TempStorage temp_storage; //! //! // Obtain one input item per thread if in range //! int thread_data; //! if (threadIdx.x < valid_items) //! thread_data = d_data[threadIdx.x]; //! //! // Return the warp-wide reductions to each lane0 //! int aggregate = WarpReduce(temp_storage).Reduce( //! thread_data, cub::Max(), valid_items); //! //! Suppose the input ``d_data`` is ``{0, 1, 2, 3, 4, ... }`` and ``valid_items`` //! is ``4``. The corresponding output ``aggregate`` in thread0 is ``3`` (and is //! undefined in other threads). //! @endrst //! //! @tparam ReductionOp //! **[inferred]** Binary reduction operator type having member //! `T operator()(const T &a, const T &b)` //! //! @param[in] input //! Calling thread's input //! //! @param[in] reduction_op //! Binary reduction operator //! //! @param[in] valid_items //! Total number of valid items in the calling thread's logical warp //! (may be less than ``LOGICAL_WARP_THREADS``) template _CCCL_DEVICE _CCCL_FORCEINLINE T Reduce(T input, ReductionOp reduction_op, int valid_items) { return InternalWarpReduce(temp_storage).template Reduce(input, valid_items, reduction_op); } //! @rst //! Computes a segmented reduction in the calling warp where segments are defined by head-flags. //! The reduction of each segment is returned to the first lane in that segment //! (which always includes *lane*\ :sub:`0`). //! //! Supports non-commutative reduction operators //! //! @smemwarpreuse //! //! Snippet //! +++++++ //! //! The code snippet below illustrates a head-segmented warp max //! reduction within a block of 32 threads (one warp). //! //! .. code-block:: c++ //! //! #include //! //! __global__ void ExampleKernel(...) //! { //! // Specialize WarpReduce for type int //! using WarpReduce = cub::WarpReduce; //! //! // Allocate WarpReduce shared memory for one warp //! __shared__ typename WarpReduce::TempStorage temp_storage; //! //! // Obtain one input item and flag per thread //! int thread_data = ... //! int head_flag = ... //! //! // Return the warp-wide reductions to each lane0 //! int aggregate = WarpReduce(temp_storage).HeadSegmentedReduce( //! thread_data, head_flag, cub::Max()); //! //! Suppose the set of input ``thread_data`` and ``head_flag`` across the block of threads //! is ``{0, 1, 2, 3, ..., 31}`` and is ``{1, 0, 0, 0, 1, 0, 0, 0, ..., 1, 0, 0, 0}``, //! respectively. The corresponding output ``aggregate`` in threads 0, 4, 8, etc. will be //! ``3``, ``7``, ``11``, etc. (and is undefined in other threads). //! @endrst //! //! @tparam ReductionOp //! **[inferred]** Binary reduction operator type having member //! `T operator()(const T &a, const T &b)` //! //! @param[in] input //! Calling thread's input //! //! @param[in] head_flag //! Head flag denoting whether or not `input` is the start of a new segment //! //! @param[in] reduction_op //! Reduction operator template _CCCL_DEVICE _CCCL_FORCEINLINE T HeadSegmentedReduce(T input, FlagT head_flag, ReductionOp reduction_op) { return InternalWarpReduce(temp_storage).template SegmentedReduce(input, head_flag, reduction_op); } //! @rst //! Computes a segmented reduction in the calling warp where segments are defined by tail-flags. //! The reduction of each segment is returned to the first lane in that segment //! (which always includes *lane*\ :sub:`0`). //! //! Supports non-commutative reduction operators //! //! @smemwarpreuse //! //! Snippet //! +++++++ //! //! The code snippet below illustrates a tail-segmented warp max //! reduction within a block of 32 threads (one warp). //! //! .. code-block:: c++ //! //! #include //! //! __global__ void ExampleKernel(...) //! { //! // Specialize WarpReduce for type int //! using WarpReduce = cub::WarpReduce; //! //! // Allocate WarpReduce shared memory for one warp //! __shared__ typename WarpReduce::TempStorage temp_storage; //! //! // Obtain one input item and flag per thread //! int thread_data = ... //! int tail_flag = ... //! //! // Return the warp-wide reductions to each lane0 //! int aggregate = WarpReduce(temp_storage).TailSegmentedReduce( //! thread_data, tail_flag, cub::Max()); //! //! Suppose the set of input ``thread_data`` and ``tail_flag`` across the block of threads //! is ``{0, 1, 2, 3, ..., 31}`` and is ``{0, 0, 0, 1, 0, 0, 0, 1, ..., 0, 0, 0, 1}``, //! respectively. The corresponding output ``aggregate`` in threads 0, 4, 8, etc. will be //! ``3``, ``7``, ``11``, etc. (and is undefined in other threads). //! @endrst //! //! @tparam ReductionOp //! **[inferred]** Binary reduction operator type having member //! `T operator()(const T &a, const T &b)` //! //! @param[in] input //! Calling thread's input //! //! @param[in] tail_flag //! Tail flag denoting whether or not \p input is the end of the current segment //! //! @param[in] reduction_op //! Reduction operator template _CCCL_DEVICE _CCCL_FORCEINLINE T TailSegmentedReduce(T input, FlagT tail_flag, ReductionOp reduction_op) { return InternalWarpReduce(temp_storage).template SegmentedReduce(input, tail_flag, reduction_op); } //! @} end member group }; #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template class WarpReduce { private: using _TempStorage = cub::NullType; public: struct InternalWarpReduce { struct TempStorage : Uninitialized<_TempStorage> {}; _CCCL_DEVICE _CCCL_FORCEINLINE InternalWarpReduce(TempStorage& /*temp_storage */) {} template _CCCL_DEVICE _CCCL_FORCEINLINE T Reduce(T input, int /* valid_items */, ReductionOp /* reduction_op */) { return input; } template _CCCL_DEVICE _CCCL_FORCEINLINE T SegmentedReduce(T input, FlagT /* flag */, ReductionOp /* reduction_op */) { return input; } }; using TempStorage = typename InternalWarpReduce::TempStorage; _CCCL_DEVICE _CCCL_FORCEINLINE WarpReduce(TempStorage& /*temp_storage */) {} _CCCL_DEVICE _CCCL_FORCEINLINE T Sum(T input) { return input; } _CCCL_DEVICE _CCCL_FORCEINLINE T Sum(T input, int /* valid_items */) { return input; } template _CCCL_DEVICE _CCCL_FORCEINLINE T HeadSegmentedSum(T input, FlagT /* head_flag */) { return input; } template _CCCL_DEVICE _CCCL_FORCEINLINE T TailSegmentedSum(T input, FlagT /* tail_flag */) { return input; } template _CCCL_DEVICE _CCCL_FORCEINLINE T Reduce(T input, ReductionOp /* reduction_op */) { return input; } template _CCCL_DEVICE _CCCL_FORCEINLINE T Reduce(T input, ReductionOp /* reduction_op */, int /* valid_items */) { return input; } template _CCCL_DEVICE _CCCL_FORCEINLINE T HeadSegmentedReduce(T input, FlagT /* head_flag */, ReductionOp /* reduction_op */) { return input; } template _CCCL_DEVICE _CCCL_FORCEINLINE T TailSegmentedReduce(T input, FlagT /* tail_flag */, ReductionOp /* reduction_op */) { return input; } }; #endif // DOXYGEN_SHOULD_SKIP_THIS CUB_NAMESPACE_END