/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ //! @file //! The cub::BlockScan class provides :ref:`collective ` methods for computing a parallel prefix //! sum/scan of items partitioned across a CUDA thread block. #pragma once #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include #include #include #include CUB_NAMESPACE_BEGIN /****************************************************************************** * Algorithmic variants ******************************************************************************/ //! @brief BlockScanAlgorithm enumerates alternative algorithms for cub::BlockScan to compute a //! parallel prefix scan across a CUDA thread block. enum BlockScanAlgorithm { //! @rst //! Overview //! ++++++++++++++++++++++++++ //! //! An efficient "raking reduce-then-scan" prefix scan algorithm. Execution is comprised of five phases: //! //! #. Upsweep sequential reduction in registers (if threads contribute more than one input each). //! Each thread then places the partial reduction of its item(s) into shared memory. //! #. Upsweep sequential reduction in shared memory. //! Threads within a single warp rake across segments of shared partial reductions. //! #. A warp-synchronous Kogge-Stone style exclusive scan within the raking warp. //! #. Downsweep sequential exclusive scan in shared memory. //! Threads within a single warp rake across segments of shared partial reductions, //! seeded with the warp-scan output. //! #. Downsweep sequential scan in registers (if threads contribute more than one input), //! seeded with the raking scan output. //! //! Performance Considerations //! ++++++++++++++++++++++++++ //! //! - Although this variant may suffer longer turnaround latencies when the //! GPU is under-occupied, it can often provide higher overall throughput //! across the GPU when suitably occupied. //! //! @endrst BLOCK_SCAN_RAKING, //! @rst //! Overview //! ++++++++++++++++++++++++++ //! //! Similar to cub::BLOCK_SCAN_RAKING, but with fewer shared memory reads at the expense of higher //! register pressure. Raking threads preserve their "upsweep" segment of values in registers while performing //! warp-synchronous scan, allowing the "downsweep" not to re-read them from shared memory. //! //! @endrst BLOCK_SCAN_RAKING_MEMOIZE, //! @rst //! Overview //! ++++++++++++++++++++++++++ //! //! A quick "tiled warpscans" prefix scan algorithm. Execution is comprised of four phases: //! #. Upsweep sequential reduction in registers (if threads contribute more than one input each). //! Each thread then places the partial reduction of its item(s) into shared memory. //! #. Compute a shallow, but inefficient warp-synchronous Kogge-Stone style scan within each warp. //! #. A propagation phase where the warp scan outputs in each warp are updated with the aggregate //! from each preceding warp. //! #. Downsweep sequential scan in registers (if threads contribute more than one input), //! seeded with the raking scan output. //! //! Performance Considerations //! ++++++++++++++++++++++++++ //! //! - Although this variant may suffer lower overall throughput across the //! GPU because due to a heavy reliance on inefficient warpscans, it can //! often provide lower turnaround latencies when the GPU is under-occupied. //! //! @endrst BLOCK_SCAN_WARP_SCANS, }; //! @rst //! The BlockScan class provides :ref:`collective ` methods for computing a parallel prefix //! sum/scan of items partitioned across a CUDA thread block. //! //! Overview //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! - Given a list of input elements and a binary reduction operator, a //! `prefix scan `_ produces an output list where each element is computed //! to be the reduction of the elements occurring earlier in the input list. *Prefix sum* connotes a prefix scan with //! the addition operator. The term *inclusive indicates* that the *i*\ :sup:`th` output reduction incorporates //! the *i*\ :sup:`th` input. The term *exclusive* indicates the *i*\ :sup:`th` input is not incorporated into //! the *i*\ :sup:`th` output reduction. //! - @rowmajor //! - BlockScan can be optionally specialized by algorithm to accommodate different workload profiles: //! //! #. :cpp:enumerator:`cub::BLOCK_SCAN_RAKING`: //! An efficient (high throughput) "raking reduce-then-scan" prefix scan algorithm. //! #. :cpp:enumerator:`cub::BLOCK_SCAN_RAKING_MEMOIZE`: //! Similar to cub::BLOCK_SCAN_RAKING, but having higher throughput at the expense of additional //! register pressure for intermediate storage. //! #. :cpp:enumerator:`cub::BLOCK_SCAN_WARP_SCANS`: //! A quick (low latency) "tiled warpscans" prefix scan algorithm. //! //! Performance Considerations //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! - @granularity //! - Uses special instructions when applicable (e.g., warp ``SHFL``) //! - Uses synchronization-free communication between warp lanes when applicable //! - Invokes a minimal number of minimal block-wide synchronization barriers (only //! one or two depending on algorithm selection) //! - Incurs zero bank conflicts for most types //! - Computation is slightly more efficient (i.e., having lower instruction overhead) for: //! //! - Prefix sum variants (vs. generic scan) //! - @blocksize //! //! - See cub::BlockScanAlgorithm for performance details regarding algorithmic alternatives //! //! A Simple Example //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! @blockcollective{BlockScan} //! //! The code snippet below illustrates an exclusive prefix sum of 512 integer items that //! are partitioned in a :ref:`blocked arrangement ` across 128 threads //! where each thread owns 4 consecutive items. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! __global__ void ExampleKernel(...) //! { //! // Specialize BlockScan for a 1D block of 128 threads of type int //! using BlockScan = cub::BlockScan; //! //! // Allocate shared memory for BlockScan //! __shared__ typename BlockScan::TempStorage temp_storage; //! //! // Obtain a segment of consecutive items that are blocked across threads //! int thread_data[4]; //! ... //! //! // Collectively compute the block-wide exclusive prefix sum //! BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data); //! //! Suppose the set of input ``thread_data`` across the block of threads is //! ``{[1,1,1,1], [1,1,1,1], ..., [1,1,1,1]}``. //! The corresponding output ``thread_data`` in those threads will be //! ``{[0,1,2,3], [4,5,6,7], ..., [508,509,510,511]}``. //! //! Re-using dynamically allocating shared memory //! +++++++++++++++++++++++++++++++++++++++++++++ //! //! The ``block/example_block_reduce_dyn_smem.cu`` example illustrates usage of dynamically shared memory with //! BlockReduce and how to re-purpose the same memory region. //! This example can be easily adapted to the storage required by BlockScan. //! //! @endrst //! //! @tparam T //! Data type being scanned //! //! @tparam BLOCK_DIM_X //! The thread block length in threads along the X dimension //! //! @tparam ALGORITHM //! **[optional]** cub::BlockScanAlgorithm enumerator specifying the underlying algorithm to use //! (default: cub::BLOCK_SCAN_RAKING) //! //! @tparam BLOCK_DIM_Y //! **[optional]** The thread block length in threads along the Y dimension //! (default: 1) //! //! @tparam BLOCK_DIM_Z //! **[optional]** The thread block length in threads along the Z dimension (default: 1) //! //! @tparam LEGACY_PTX_ARCH //! **[optional]** Unused. template class BlockScan { private: /// Constants enum { /// The thread block size in threads BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, }; /** * Ensure the template parameterization meets the requirements of the * specified algorithm. Currently, the BLOCK_SCAN_WARP_SCANS policy * cannot be used with thread block sizes not a multiple of the * architectural warp size. */ static constexpr BlockScanAlgorithm SAFE_ALGORITHM = ((ALGORITHM == BLOCK_SCAN_WARP_SCANS) && (BLOCK_THREADS % CUB_WARP_THREADS(0) != 0)) ? BLOCK_SCAN_RAKING : ALGORITHM; using WarpScans = BlockScanWarpScans; using Raking = BlockScanRaking; /// Define the delegate type for the desired algorithm using InternalBlockScan = ::cuda::std::_If; /// Shared memory storage layout type for BlockScan using _TempStorage = typename InternalBlockScan::TempStorage; /// Shared storage reference _TempStorage& temp_storage; /// Linear thread-id unsigned int linear_tid; /// Internal storage allocator _CCCL_DEVICE _CCCL_FORCEINLINE _TempStorage& PrivateStorage() { __shared__ _TempStorage private_storage; return private_storage; } public: /// @smemstorage{BlockScan} struct TempStorage : Uninitialized<_TempStorage> {}; //! @name Collective constructors //! @{ //! @brief Collective constructor using a private static allocation of shared memory as temporary storage. _CCCL_DEVICE _CCCL_FORCEINLINE BlockScan() : temp_storage(PrivateStorage()) , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) {} /** * @brief Collective constructor using the specified memory allocation as temporary storage. * * @param[in] temp_storage * Reference to memory allocation having layout type TempStorage */ _CCCL_DEVICE _CCCL_FORCEINLINE BlockScan(TempStorage& temp_storage) : temp_storage(temp_storage.Alias()) , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) {} //! @} end member group //! @name Exclusive prefix sum operations //! @{ //! @rst //! Computes an exclusive block-wide prefix scan using addition (+) as the scan operator. //! Each thread contributes one input element. The value of 0 is applied as the initial value, and is assigned //! to ``output`` in *thread*\ :sub:`0`. //! //! - @identityzero //! - @rowmajor //! - @smemreuse //! //! Snippet //! +++++++ //! //! The code snippet below illustrates an exclusive prefix sum of 128 integer items that //! are partitioned across 128 threads. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! __global__ void ExampleKernel(...) //! { //! // Specialize BlockScan for a 1D block of 128 threads of type int //! using BlockScan = cub::BlockScan; //! //! // Allocate shared memory for BlockScan //! __shared__ typename BlockScan::TempStorage temp_storage; //! //! // Obtain input item for each thread //! int thread_data; //! ... //! //! // Collectively compute the block-wide exclusive prefix sum //! BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data); //! //! Suppose the set of input ``thread_data`` across the block of threads is ``1, 1, ..., 1``. //! The corresponding output ``thread_data`` in those threads will be ``0, 1, ..., 127``. //! //! @endrst //! //! @param[in] input //! Calling thread's input item //! //! @param[out] output //! Calling thread's output item (may be aliased to `input`) _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveSum(T input, T& output) { T initial_value{}; ExclusiveScan(input, output, initial_value, cub::Sum()); } //! @rst //! Computes an exclusive block-wide prefix scan using addition (+) as the scan operator. //! Each thread contributes one input element. //! The value of 0 is applied as the initial value, and is assigned to ``output`` in *thread*\ :sub:`0`. //! Also provides every thread with the block-wide ``block_aggregate`` of all inputs. //! //! - @identityzero //! - @rowmajor //! - @smemreuse //! //! Snippet //! +++++++ //! //! The code snippet below illustrates an exclusive prefix sum of 128 integer items that //! are partitioned across 128 threads. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! __global__ void ExampleKernel(...) //! { //! // Specialize BlockScan for a 1D block of 128 threads of type int //! using BlockScan = cub::BlockScan; //! //! // Allocate shared memory for BlockScan //! __shared__ typename BlockScan::TempStorage temp_storage; //! //! // Obtain input item for each thread //! int thread_data; //! ... //! //! // Collectively compute the block-wide exclusive prefix sum //! int block_aggregate; //! BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data, block_aggregate); //! //! Suppose the set of input ``thread_data`` across the block of threads is ``1, 1, ..., 1``. //! The corresponding output ``thread_data`` in those threads will be ``0, 1, ..., 127``. //! Furthermore the value ``128`` will be stored in ``block_aggregate`` for all threads. //! //! @endrst //! //! @param[in] input //! Calling thread's input item //! //! @param[out] output //! Calling thread's output item (may be aliased to `input`) //! //! @param[out] block_aggregate //! block-wide aggregate reduction of input items _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveSum(T input, T& output, T& block_aggregate) { T initial_value{}; ExclusiveScan(input, output, initial_value, cub::Sum(), block_aggregate); } //! @rst //! Computes an exclusive block-wide prefix scan using addition (+) as the scan operator. //! Each thread contributes one input element. Instead of using 0 as the block-wide prefix, the call-back functor //! ``block_prefix_callback_op`` is invoked by the first warp in the block, and the value returned by //! *lane*\ :sub:`0` in that warp is used as the "seed" value that logically prefixes the thread block's //! scan inputs. Also provides every thread with the block-wide ``block_aggregate`` of all inputs. //! //! - @identityzero //! - The ``block_prefix_callback_op`` functor must implement a member function //! ``T operator()(T block_aggregate)``. The functor's input parameter ``block_aggregate`` is the same value //! also returned by the scan operation. The functor will be invoked by the first warp of threads in the block, //! however only the return value from *lane*\ :sub:`0` is applied as the block-wide prefix. Can be stateful. //! - @rowmajor //! - @smemreuse //! //! Snippet //! +++++++ //! //! The code snippet below illustrates a single thread block that progressively //! computes an exclusive prefix sum over multiple "tiles" of input using a //! prefix functor to maintain a running total between block-wide scans. Each tile consists //! of 128 integer items that are partitioned across 128 threads. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! // A stateful callback functor that maintains a running prefix to be applied //! // during consecutive scan operations. //! struct BlockPrefixCallbackOp //! { //! // Running prefix //! int running_total; //! //! // Constructor //! __host__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {} //! //! // Callback operator to be entered by the first warp of threads in the block. //! // Thread-0 is responsible for returning a value for seeding the block-wide scan. //! __host__ int operator()(int block_aggregate) //! { //! int old_prefix = running_total; //! running_total += block_aggregate; //! return old_prefix; //! } //! }; //! //! __global__ void ExampleKernel(int *d_data, int num_items, ...) //! { //! // Specialize BlockScan for a 1D block of 128 threads //! using BlockScan = cub::BlockScan; //! //! // Allocate shared memory for BlockScan //! __shared__ typename BlockScan::TempStorage temp_storage; //! //! // Initialize running total //! BlockPrefixCallbackOp prefix_op(0); //! //! // Have the block iterate over segments of items //! for (int block_offset = 0; block_offset < num_items; block_offset += 128) //! { //! // Load a segment of consecutive items that are blocked across threads //! int thread_data = d_data[block_offset]; //! //! // Collectively compute the block-wide exclusive prefix sum //! BlockScan(temp_storage).ExclusiveSum( //! thread_data, thread_data, prefix_op); //! CTA_SYNC(); //! //! // Store scanned items to output segment //! d_data[block_offset] = thread_data; //! } //! //! Suppose the input ``d_data`` is ``1, 1, 1, 1, 1, 1, 1, 1, ...``. //! The corresponding output for the first segment will be ``0, 1, ..., 127``. //! The output for the second segment will be ``128, 129, ..., 255``. //! //! @endrst //! //! @tparam BlockPrefixCallbackOp //! **[inferred]** Call-back functor type having member `T operator()(T block_aggregate)` //! //! @param[in] input //! Calling thread's input item //! //! @param[out] output //! Calling thread's output item (may be aliased to `input`) //! //! @param[in,out] block_prefix_callback_op //! @rst //! *warp*\ :sub:`0` only call-back functor for specifying a block-wide prefix to be applied to //! the logical input sequence. //! @endrst template _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveSum(T input, T& output, BlockPrefixCallbackOp& block_prefix_callback_op) { ExclusiveScan(input, output, cub::Sum(), block_prefix_callback_op); } //! @} end member group //! @name Exclusive prefix sum operations (multiple data per thread) //! @{ //! @rst //! Computes an exclusive block-wide prefix scan using addition (+) as the scan operator. //! Each thread contributes an array of consecutive input elements. //! The value of 0 is applied as the initial value, and is assigned to ``output[0]`` in *thread*\ :sub:`0`. //! //! - @identityzero //! - @blocked //! - @granularity //! - @smemreuse //! //! Snippet //! +++++++ //! //! The code snippet below illustrates an exclusive prefix sum of 512 integer items that //! are partitioned in a :ref:`blocked arrangement ` across 128 threads //! where each thread owns 4 consecutive items. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! __global__ void ExampleKernel(...) //! { //! // Specialize BlockScan for a 1D block of 128 threads of type int //! using BlockScan = cub::BlockScan; //! //! // Allocate shared memory for BlockScan //! __shared__ typename BlockScan::TempStorage temp_storage; //! //! // Obtain a segment of consecutive items that are blocked across threads //! int thread_data[4]; //! ... //! //! // Collectively compute the block-wide exclusive prefix sum //! BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data); //! //! Suppose the set of input ``thread_data`` across the block of threads is //! ``{ [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }``. //! The corresponding output ``thread_data`` in those threads will be //! ``{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }``. //! //! @endrst //! //! @tparam ITEMS_PER_THREAD //! **[inferred]** The number of consecutive items partitioned onto each thread. //! //! @param[in] input //! Calling thread's input items //! //! @param[out] output //! Calling thread's output items (may be aliased to `input`) template _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveSum(T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD]) { T initial_value{}; ExclusiveScan(input, output, initial_value, cub::Sum()); } //! @rst //! Computes an exclusive block-wide prefix scan using addition (+) as the scan operator. //! Each thread contributes an array of consecutive input elements. //! The value of 0 is applied as the initial value, and is assigned to ``output[0]`` in *thread*\ :sub:`0`. //! Also provides every thread with the block-wide ``block_aggregate`` of all inputs. //! //! - @identityzero //! - @blocked //! - @granularity //! - @smemreuse //! //! Snippet //! +++++++ //! //! The code snippet below illustrates an exclusive prefix sum of 512 integer items that are partitioned in //! a :ref:`blocked arrangement ` across 128 threads where each thread owns //! 4 consecutive items. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! __global__ void ExampleKernel(...) //! { //! // Specialize BlockScan for a 1D block of 128 threads of type int //! using BlockScan = cub::BlockScan; //! //! // Allocate shared memory for BlockScan //! __shared__ typename BlockScan::TempStorage temp_storage; //! //! // Obtain a segment of consecutive items that are blocked across threads //! int thread_data[4]; //! ... //! //! // Collectively compute the block-wide exclusive prefix sum //! int block_aggregate; //! BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data, block_aggregate); //! //! Suppose the set of input ``thread_data`` across the block of threads is //! ``{ [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }``. //! The corresponding output ``thread_data`` in those threads will be //! ``{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }``. //! Furthermore the value ``512`` will be stored in ``block_aggregate`` for all threads. //! //! @endrst //! //! @tparam ITEMS_PER_THREAD //! **[inferred]** The number of consecutive items partitioned onto each thread. //! //! @param[in] input //! Calling thread's input items //! //! @param[out] output //! Calling thread's output items (may be aliased to `input`) //! //! @param[out] block_aggregate //! block-wide aggregate reduction of input items template _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveSum(T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], T& block_aggregate) { // Reduce consecutive thread items in registers T initial_value{}; ExclusiveScan(input, output, initial_value, cub::Sum(), block_aggregate); } //! @rst //! Computes an exclusive block-wide prefix scan using addition (+) as the scan operator. //! Each thread contributes an array of consecutive input elements. //! Instead of using 0 as the block-wide prefix, the call-back functor ``block_prefix_callback_op`` is invoked by //! the first warp in the block, and the value returned by *lane*\ :sub:`0` in that warp is used as the "seed" //! value that logically prefixes the thread block's scan inputs. //! Also provides every thread with the block-wide ``block_aggregate`` of all inputs. //! //! - @identityzero //! - The ``block_prefix_callback_op`` functor must implement a member function ``T operator()(T block_aggregate)``. //! The functor's input parameter ``block_aggregate`` is the same value also returned //! by the scan operation. The functor will be invoked by the first warp of threads in //! the block, however only the return value from *lane*\ :sub:`0` is applied as the block-wide prefix. //! Can be stateful. //! - @blocked //! - @granularity //! - @smemreuse //! //! //! Snippet //! +++++++ //! //! The code snippet below illustrates a single thread block that progressively //! computes an exclusive prefix sum over multiple "tiles" of input using a //! prefix functor to maintain a running total between block-wide scans. Each tile consists //! of 512 integer items that are partitioned in a :ref:`blocked arrangement ` //! across 128 threads where each thread owns 4 consecutive items. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! // A stateful callback functor that maintains a running prefix to be applied //! // during consecutive scan operations. //! struct BlockPrefixCallbackOp //! { //! // Running prefix //! int running_total; //! //! // Constructor //! __host__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {} //! //! // Callback operator to be entered by the first warp of threads in the block. //! // Thread-0 is responsible for returning a value for seeding the block-wide scan. //! __host__ int operator()(int block_aggregate) //! { //! int old_prefix = running_total; //! running_total += block_aggregate; //! return old_prefix; //! } //! }; //! //! __global__ void ExampleKernel(int *d_data, int num_items, ...) //! { //! // Specialize BlockLoad, BlockStore, and BlockScan for a 1D block of 128 threads, 4 ints per thread //! using BlockLoad = cub::BlockLoad; //! using BlockStore = cub::BlockStore; //! using BlockScan = cub::BlockScan; //! //! // Allocate aliased shared memory for BlockLoad, BlockStore, and BlockScan //! __shared__ union { //! typename BlockLoad::TempStorage load; //! typename BlockScan::TempStorage scan; //! typename BlockStore::TempStorage store; //! } temp_storage; //! //! // Initialize running total //! BlockPrefixCallbackOp prefix_op(0); //! //! // Have the block iterate over segments of items //! for (int block_offset = 0; block_offset < num_items; block_offset += 128 * 4) //! { //! // Load a segment of consecutive items that are blocked across threads //! int thread_data[4]; //! BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data); //! CTA_SYNC(); //! //! // Collectively compute the block-wide exclusive prefix sum //! int block_aggregate; //! BlockScan(temp_storage.scan).ExclusiveSum( //! thread_data, thread_data, prefix_op); //! CTA_SYNC(); //! //! // Store scanned items to output segment //! BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data); //! CTA_SYNC(); //! } //! //! Suppose the input ``d_data`` is ``1, 1, 1, 1, 1, 1, 1, 1, ...``. //! The corresponding output for the first segment will be ``0, 1, 2, 3, ..., 510, 511``. //! The output for the second segment will be ``512, 513, 514, 515, ..., 1022, 1023``. //! //! @endrst //! //! @tparam ITEMS_PER_THREAD //! **[inferred]** The number of consecutive items partitioned onto each thread. //! //! @tparam BlockPrefixCallbackOp //! **[inferred]** Call-back functor type having member //! `T operator()(T block_aggregate)` //! //! @param[in] input //! Calling thread's input items //! //! @param[out] output //! Calling thread's output items (may be aliased to `input`) //! //! @param[in,out] block_prefix_callback_op //! @rst //! *warp*\ :sub:`0` only call-back functor for specifying a block-wide prefix to be applied to //! the logical input sequence. //! @endrst template _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveSum( T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], BlockPrefixCallbackOp& block_prefix_callback_op) { ExclusiveScan(input, output, cub::Sum(), block_prefix_callback_op); } //! @} end member group // Exclusive prefix sums //! @name Exclusive prefix scan operations //! @{ //! @rst //! Computes an exclusive block-wide prefix scan using the specified binary ``scan_op`` functor. //! Each thread contributes one input element. //! //! - Supports non-commutative scan operators. //! - @rowmajor //! - @smemreuse //! //! Snippet //! +++++++ //! //! The code snippet below illustrates an exclusive prefix max scan of 128 integer items that //! are partitioned across 128 threads. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! __global__ void ExampleKernel(...) //! { //! // Specialize BlockScan for a 1D block of 128 threads of type int //! using BlockScan = cub::BlockScan; //! //! // Allocate shared memory for BlockScan //! __shared__ typename BlockScan::TempStorage temp_storage; //! //! // Obtain input item for each thread //! int thread_data; //! ... //! //! // Collectively compute the block-wide exclusive prefix max scan //! BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max()); //! //! Suppose the set of input ``thread_data`` across the block of threads is ``0, -1, 2, -3, ..., 126, -127``. //! The corresponding output ``thread_data`` in those threads will be ``INT_MIN, 0, 0, 2, ..., 124, 126``. //! //! @endrst //! //! @tparam ScanOp //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)` //! //! @param[in] input //! Calling thread's input item //! //! @param[out] output //! Calling thread's output item (may be aliased to `input`) //! //! @param[in] initial_value //! @rst //! Initial value to seed the exclusive scan (and is assigned to `output[0]` in *thread*\ :sub:`0`) //! @endrst //! //! @param[in] scan_op //! Binary scan functor template _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveScan(T input, T& output, T initial_value, ScanOp scan_op) { InternalBlockScan(temp_storage).ExclusiveScan(input, output, initial_value, scan_op); } //! @rst //! Computes an exclusive block-wide prefix scan using the specified binary ``scan_op`` functor. //! Each thread contributes one input element. //! Also provides every thread with the block-wide ``block_aggregate`` of all inputs. //! //! - Supports non-commutative scan operators. //! - @rowmajor //! - @smemreuse //! //! Snippet //! +++++++ //! //! The code snippet below illustrates an exclusive prefix max scan of 128 integer items that //! are partitioned across 128 threads. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! __global__ void ExampleKernel(...) //! { //! // Specialize BlockScan for a 1D block of 128 threads of type int //! using BlockScan = cub::BlockScan; //! //! // Allocate shared memory for BlockScan //! __shared__ typename BlockScan::TempStorage temp_storage; //! //! // Obtain input item for each thread //! int thread_data; //! ... //! //! // Collectively compute the block-wide exclusive prefix max scan //! int block_aggregate; //! BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max(), block_aggregate); //! //! Suppose the set of input ``thread_data`` across the block of threads is ``0, -1, 2, -3, ..., 126, -127``. //! The corresponding output ``thread_data`` in those threads will be ``INT_MIN, 0, 0, 2, ..., 124, 126``. //! Furthermore the value ``126`` will be stored in ``block_aggregate`` for all threads. //! //! @endrst //! //! @tparam ScanOp //! **[inferred]** Binary scan functor type having member ``T operator()(const T &a, const T &b)`` //! //! @param[in] input //! Calling thread's input items //! //! @param[out] output //! Calling thread's output items (may be aliased to ``input``) //! //! @param[in] initial_value //! @rst //! Initial value to seed the exclusive scan (and is assigned to ``output[0]`` in *thread*\ :sub:`0`) //! @endrst //! //! @param[in] scan_op //! Binary scan functor //! //! @param[out] block_aggregate //! block-wide aggregate reduction of input items template _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveScan(T input, T& output, T initial_value, ScanOp scan_op, T& block_aggregate) { InternalBlockScan(temp_storage).ExclusiveScan(input, output, initial_value, scan_op, block_aggregate); } //! @rst //! Computes an exclusive block-wide prefix scan using the specified binary ``scan_op`` functor. //! Each thread contributes one input element. The call-back functor ``block_prefix_callback_op`` is invoked by //! the first warp in the block, and the value returned by *lane*\ :sub:`0` in that warp is used as //! the "seed" value that logically prefixes the thread block's scan inputs. //! Also provides every thread with the block-wide ``block_aggregate`` of all inputs. //! //! - The ``block_prefix_callback_op`` functor must implement a member function ``T operator()(T block_aggregate)``. //! The functor's input parameter ``block_aggregate`` is the same value also returned by the scan operation. //! The functor will be invoked by the first warp of threads in the block, however only the return value from //! *lane*\ :sub:`0` is applied as the block-wide prefix. Can be stateful. //! - Supports non-commutative scan operators. //! - @rowmajor //! - @smemreuse //! //! Snippet //! +++++++ //! //! The code snippet below illustrates a single thread block that progressively //! computes an exclusive prefix max scan over multiple "tiles" of input using a //! prefix functor to maintain a running total between block-wide scans. //! Each tile consists of 128 integer items that are partitioned across 128 threads. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! // A stateful callback functor that maintains a running prefix to be applied //! // during consecutive scan operations. //! struct BlockPrefixCallbackOp //! { //! // Running prefix //! int running_total; //! //! // Constructor //! __host__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {} //! //! // Callback operator to be entered by the first warp of threads in the block. //! // Thread-0 is responsible for returning a value for seeding the block-wide scan. //! __host__ int operator()(int block_aggregate) //! { //! int old_prefix = running_total; //! running_total = (block_aggregate > old_prefix) ? block_aggregate : old_prefix; //! return old_prefix; //! } //! }; //! //! __global__ void ExampleKernel(int *d_data, int num_items, ...) //! { //! // Specialize BlockScan for a 1D block of 128 threads //! using BlockScan = cub::BlockScan; //! //! // Allocate shared memory for BlockScan //! __shared__ typename BlockScan::TempStorage temp_storage; //! //! // Initialize running total //! BlockPrefixCallbackOp prefix_op(INT_MIN); //! //! // Have the block iterate over segments of items //! for (int block_offset = 0; block_offset < num_items; block_offset += 128) //! { //! // Load a segment of consecutive items that are blocked across threads //! int thread_data = d_data[block_offset]; //! //! // Collectively compute the block-wide exclusive prefix max scan //! BlockScan(temp_storage).ExclusiveScan( //! thread_data, thread_data, INT_MIN, cub::Max(), prefix_op); //! CTA_SYNC(); //! //! // Store scanned items to output segment //! d_data[block_offset] = thread_data; //! } //! //! Suppose the input ``d_data`` is ``0, -1, 2, -3, 4, -5, ...``. //! The corresponding output for the first segment will be ``INT_MIN, 0, 0, 2, ..., 124, 126``. //! The output for the second segment will be ``126, 128, 128, 130, ..., 252, 254``. //! //! @endrst //! //! @tparam ScanOp //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)` //! //! @tparam BlockPrefixCallbackOp //! **[inferred]** Call-back functor type having member `T operator()(T block_aggregate)` //! //! @param[in] input //! Calling thread's input item //! //! @param[out] output //! Calling thread's output item (may be aliased to `input`) //! //! @param[in] scan_op //! Binary scan functor //! //! @param[in,out] block_prefix_callback_op //! @rst //! *warp*\ :sub:`0` only call-back functor for specifying a block-wide prefix to be applied to //! the logical input sequence. //! @endrst template _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveScan(T input, T& output, ScanOp scan_op, BlockPrefixCallbackOp& block_prefix_callback_op) { InternalBlockScan(temp_storage).ExclusiveScan(input, output, scan_op, block_prefix_callback_op); } //! @} end member group // Inclusive prefix sums //! @name Exclusive prefix scan operations (multiple data per thread) //! @{ //! @rst //! Computes an exclusive block-wide prefix scan using the specified binary ``scan_op`` functor. //! Each thread contributes an array of consecutive input elements. //! //! - Supports non-commutative scan operators. //! - @blocked //! - @granularity //! - @smemreuse //! //! Snippet //! +++++++ //! //! The code snippet below illustrates an exclusive prefix max scan of 512 integer //! items that are partitioned in a [blocked arrangement](../index.html#sec5sec3) //! across 128 threads where each thread owns 4 consecutive items. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! __global__ void ExampleKernel(...) //! { //! // Specialize BlockScan for a 1D block of 128 threads of type int //! using BlockScan = cub::BlockScan; //! //! // Allocate shared memory for BlockScan //! __shared__ typename BlockScan::TempStorage temp_storage; //! //! // Obtain a segment of consecutive items that are blocked across threads //! int thread_data[4]; //! ... //! //! // Collectively compute the block-wide exclusive prefix max scan //! BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max()); //! //! Suppose the set of input ``thread_data`` across the block of threads is //! ``{ [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }``. //! The corresponding output ``thread_data`` in those threads will be //! ``{ [INT_MIN,0,0,2], [2,4,4,6], ..., [506,508,508,510] }``. //! //! @endrst //! //! @tparam ITEMS_PER_THREAD //! **[inferred]** The number of consecutive items partitioned onto each thread. //! //! @tparam ScanOp //! **[inferred]** Binary scan functor type having member //! `T operator()(const T &a, const T &b)` //! //! @param[in] input //! Calling thread's input items //! //! @param[out] output //! Calling thread's output items (may be aliased to `input`) //! //! @param[in] initial_value //! @rst //! Initial value to seed the exclusive scan (and is assigned to `output[0]` in *thread*\ :sub:`0`) //! @endrst //! //! @param[in] scan_op //! Binary scan functor template _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveScan(T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], T initial_value, ScanOp scan_op) { // Reduce consecutive thread items in registers T thread_prefix = internal::ThreadReduce(input, scan_op); // Exclusive thread block-scan ExclusiveScan(thread_prefix, thread_prefix, initial_value, scan_op); // Exclusive scan in registers with prefix as seed internal::ThreadScanExclusive(input, output, scan_op, thread_prefix); } //! @rst //! Computes an exclusive block-wide prefix scan using the specified binary ``scan_op`` functor. //! Each thread contributes an array of consecutive input elements. //! Also provides every thread with the block-wide ``block_aggregate`` of all inputs. //! //! - Supports non-commutative scan operators. //! - @blocked //! - @granularity //! - @smemreuse //! //! Snippet //! +++++++ //! //! The code snippet below illustrates an exclusive prefix max scan of 512 integer items that are partitioned in //! a :ref:`blocked arrangement ` across 128 threads where each thread owns //! 4 consecutive items. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! __global__ void ExampleKernel(...) //! { //! // Specialize BlockScan for a 1D block of 128 threads of type int //! using BlockScan = cub::BlockScan; //! //! // Allocate shared memory for BlockScan //! __shared__ typename BlockScan::TempStorage temp_storage; //! //! // Obtain a segment of consecutive items that are blocked across threads //! int thread_data[4]; //! ... //! //! // Collectively compute the block-wide exclusive prefix max scan //! int block_aggregate; //! BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max(), block_aggregate); //! //! Suppose the set of input ``thread_data`` across the block of threads is //! ``{ [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }``. //! The corresponding output ``thread_data`` in those threads will be //! ``{ [INT_MIN,0,0,2], [2,4,4,6], ..., [506,508,508,510] }``. //! Furthermore the value ``510`` will be stored in ``block_aggregate`` for all threads. //! //! @endrst //! //! @tparam ITEMS_PER_THREAD //! **[inferred]** The number of consecutive items partitioned onto each thread. //! //! @tparam ScanOp //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)` //! //! @param[in] input //! Calling thread's input items //! //! @param[out] output //! Calling thread's output items (may be aliased to `input`) //! //! @param[in] initial_value //! @rst //! Initial value to seed the exclusive scan (and is assigned to `output[0]` in *thread*\ :sub:`0`) //! @endrst //! //! @param[in] scan_op //! Binary scan functor //! //! @param[out] block_aggregate //! block-wide aggregate reduction of input items template _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveScan( T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], T initial_value, ScanOp scan_op, T& block_aggregate) { // Reduce consecutive thread items in registers T thread_prefix = internal::ThreadReduce(input, scan_op); // Exclusive thread block-scan ExclusiveScan(thread_prefix, thread_prefix, initial_value, scan_op, block_aggregate); // Exclusive scan in registers with prefix as seed internal::ThreadScanExclusive(input, output, scan_op, thread_prefix); } //! @rst //! Computes an exclusive block-wide prefix scan using the specified binary ``scan_op`` functor. //! Each thread contributes an array of consecutive input elements. //! The call-back functor ``block_prefix_callback_op`` is invoked by the first warp in the block, and the value //! returned by *lane*\ :sub:`0` in that warp is used as the "seed" value that logically prefixes the thread //! block's scan inputs. Also provides every thread with the block-wide ``block_aggregate`` of all inputs. //! //! - The ``block_prefix_callback_op`` functor must implement a member function //! ``T operator()(T block_aggregate)``. The functor's input parameter ``block_aggregate`` //! is the same value also returned by the scan operation. The functor will be invoked by the //! first warp of threads in the block, however only the return value from //! *lane*\ :sub:`0` is applied as the block-wide prefix. Can be stateful. //! - Supports non-commutative scan operators. //! - @blocked //! - @granularity //! - @smemreuse //! //! Snippet //! +++++++ //! //! The code snippet below illustrates a single thread block that progressively //! computes an exclusive prefix max scan over multiple "tiles" of input using a //! prefix functor to maintain a running total between block-wide scans. Each tile consists //! of 128 integer items that are partitioned across 128 threads. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! // A stateful callback functor that maintains a running prefix to be applied //! // during consecutive scan operations. //! struct BlockPrefixCallbackOp //! { //! // Running prefix //! int running_total; //! //! // Constructor //! __host__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {} //! //! // Callback operator to be entered by the first warp of threads in the block. //! // Thread-0 is responsible for returning a value for seeding the block-wide scan. //! __host__ int operator()(int block_aggregate) //! { //! int old_prefix = running_total; //! running_total = (block_aggregate > old_prefix) ? block_aggregate : old_prefix; //! return old_prefix; //! } //! }; //! //! __global__ void ExampleKernel(int *d_data, int num_items, ...) //! { //! // Specialize BlockLoad, BlockStore, and BlockScan for a 1D block of 128 threads, 4 ints per thread //! using BlockLoad = cub::BlockLoad ; //! using BlockStore = cub::BlockStore ; //! using BlockScan = cub::BlockScan ; //! //! // Allocate aliased shared memory for BlockLoad, BlockStore, and BlockScan //! __shared__ union { //! typename BlockLoad::TempStorage load; //! typename BlockScan::TempStorage scan; //! typename BlockStore::TempStorage store; //! } temp_storage; //! //! // Initialize running total //! BlockPrefixCallbackOp prefix_op(0); //! //! // Have the block iterate over segments of items //! for (int block_offset = 0; block_offset < num_items; block_offset += 128 * 4) //! { //! // Load a segment of consecutive items that are blocked across threads //! int thread_data[4]; //! BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data); //! CTA_SYNC(); //! //! // Collectively compute the block-wide exclusive prefix max scan //! BlockScan(temp_storage.scan).ExclusiveScan( //! thread_data, thread_data, INT_MIN, cub::Max(), prefix_op); //! CTA_SYNC(); //! //! // Store scanned items to output segment //! BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data); //! CTA_SYNC(); //! } //! //! Suppose the input ``d_data`` is ``0, -1, 2, -3, 4, -5, ...``. //! The corresponding output for the first segment will be //! ``INT_MIN, 0, 0, 2, 2, 4, ..., 508, 510``. //! The output for the second segment will be //! ``510, 512, 512, 514, 514, 516, ..., 1020, 1022``. //! //! @endrst //! //! @tparam ITEMS_PER_THREAD //! **[inferred]** The number of consecutive items partitioned onto each thread. //! //! @tparam ScanOp //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)` //! //! @tparam BlockPrefixCallbackOp //! **[inferred]** Call-back functor type having member `T operator()(T block_aggregate)` //! //! @param[in] input //! Calling thread's input items //! //! @param[out] output //! Calling thread's output items (may be aliased to `input`) //! //! @param[in] scan_op //! Binary scan functor //! //! @param[in,out] block_prefix_callback_op //! @rst //! *warp*\ :sub:`0` only call-back functor for specifying a block-wide prefix to be applied to //! the logical input sequence. //! @endrst template _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveScan( T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], ScanOp scan_op, BlockPrefixCallbackOp& block_prefix_callback_op) { // Reduce consecutive thread items in registers T thread_prefix = internal::ThreadReduce(input, scan_op); // Exclusive thread block-scan ExclusiveScan(thread_prefix, thread_prefix, scan_op, block_prefix_callback_op); // Exclusive scan in registers with prefix as seed internal::ThreadScanExclusive(input, output, scan_op, thread_prefix); } //! @} end member group #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document no-initial-value scans //! @name Exclusive prefix scan operations (no initial value, single datum per thread) //! @{ //! @rst //! Computes an exclusive block-wide prefix scan using the specified binary ``scan_op`` functor. //! Each thread contributes one input element. //! With no initial value, the output computed for *thread*\ :sub:`0` is undefined. //! //! - Supports non-commutative scan operators. //! - @rowmajor //! - @smemreuse //! //! @endrst //! //! @tparam ScanOp //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)` //! //! @param[in] input //! Calling thread's input item //! //! @param[out] output //! Calling thread's output item (may be aliased to `input`) //! //! @param[in] scan_op //! Binary scan functor template _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveScan(T input, T& output, ScanOp scan_op) { InternalBlockScan(temp_storage).ExclusiveScan(input, output, scan_op); } //! @rst //! Computes an exclusive block-wide prefix scan using the specified binary ``scan_op`` functor. //! Each thread contributes one input element. Also provides every thread with the block-wide //! ``block_aggregate`` of all inputs. With no initial value, the output computed for //! *thread*\ :sub:`0` is undefined. //! //! - Supports non-commutative scan operators. //! - @rowmajor //! - @smemreuse //! //! @endrst //! //! @tparam ScanOp //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)` //! //! @param[in] input //! Calling thread's input item //! //! @param[out] output //! Calling thread's output item (may be aliased to `input`) //! //! @param[in] scan_op //! Binary scan functor //! //! @param[out] block_aggregate //! block-wide aggregate reduction of input items template _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveScan(T input, T& output, ScanOp scan_op, T& block_aggregate) { InternalBlockScan(temp_storage).ExclusiveScan(input, output, scan_op, block_aggregate); } //! @} end member group //! @name Exclusive prefix scan operations (no initial value, multiple data per thread) //! @{ //! @rst //! Computes an exclusive block-wide prefix scan using the specified binary ``scan_op`` functor. //! Each thread contributes an array of consecutive input elements. With no initial value, the //! output computed for *thread*\ :sub:`0` is undefined. //! //! - Supports non-commutative scan operators. //! - @blocked //! - @granularity //! - @smemreuse //! //! @endrst //! //! @tparam ITEMS_PER_THREAD //! **[inferred]** The number of consecutive items partitioned onto each thread. //! //! @tparam ScanOp //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)` //! //! @param[in] input //! Calling thread's input items //! //! @param[out] output //! Calling thread's output items (may be aliased to `input`) //! //! @param[in] scan_op //! Binary scan functor template _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveScan(T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], ScanOp scan_op) { // Reduce consecutive thread items in registers T thread_partial = internal::ThreadReduce(input, scan_op); // Exclusive thread block-scan ExclusiveScan(thread_partial, thread_partial, scan_op); // Exclusive scan in registers with prefix internal::ThreadScanExclusive(input, output, scan_op, thread_partial, (linear_tid != 0)); } //! @rst //! Computes an exclusive block-wide prefix scan using the specified binary ``scan_op`` functor. //! Each thread contributes an array of consecutive input elements. Also provides every thread //! with the block-wide ``block_aggregate`` of all inputs. //! With no initial value, the output computed for *thread*\ :sub:`0` is undefined. //! //! - Supports non-commutative scan operators. //! - @blocked //! - @granularity //! - @smemreuse //! //! @endrst //! //! @tparam ITEMS_PER_THREAD //! **[inferred]** The number of consecutive items partitioned onto each thread. //! //! @tparam ScanOp //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)` //! //! @param[in] input //! Calling thread's input items //! //! @param[out] output //! Calling thread's output items (may be aliased to `input`) //! //! @param[in] scan_op //! Binary scan functor //! //! @param[out] block_aggregate //! block-wide aggregate reduction of input items template _CCCL_DEVICE _CCCL_FORCEINLINE void ExclusiveScan(T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], ScanOp scan_op, T& block_aggregate) { // Reduce consecutive thread items in registers T thread_partial = internal::ThreadReduce(input, scan_op); // Exclusive thread block-scan ExclusiveScan(thread_partial, thread_partial, scan_op, block_aggregate); // Exclusive scan in registers with prefix internal::ThreadScanExclusive(input, output, scan_op, thread_partial, (linear_tid != 0)); } //! @} end member group #endif // DOXYGEN_SHOULD_SKIP_THIS // Do not document no-initial-value scans //! @name Inclusive prefix sum operations //! @{ //! @rst //! Computes an inclusive block-wide prefix scan using addition (+) //! as the scan operator. Each thread contributes one input element. //! //! - @rowmajor //! - @smemreuse //! //! Snippet //! +++++++ //! //! The code snippet below illustrates an inclusive prefix sum of 128 integer items that //! are partitioned across 128 threads. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! __global__ void ExampleKernel(...) //! { //! // Specialize BlockScan for a 1D block of 128 threads of type int //! using BlockScan = cub::BlockScan; //! //! // Allocate shared memory for BlockScan //! __shared__ typename BlockScan::TempStorage temp_storage; //! //! // Obtain input item for each thread //! int thread_data; //! ... //! //! // Collectively compute the block-wide inclusive prefix sum //! BlockScan(temp_storage).InclusiveSum(thread_data, thread_data); //! //! Suppose the set of input ``thread_data`` across the block of threads is ``1, 1, ..., 1``. //! The corresponding output ``thread_data`` in those threads will be ``1, 2, ..., 128``. //! //! @endrst //! //! @param[in] input //! Calling thread's input item //! //! @param[out] output //! Calling thread's output item (may be aliased to `input`) _CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveSum(T input, T& output) { InclusiveScan(input, output, cub::Sum()); } //! @rst //! Computes an inclusive block-wide prefix scan using addition (+) as the scan operator. //! Each thread contributes one input element. //! Also provides every thread with the block-wide ``block_aggregate`` of all inputs. //! //! - @rowmajor //! - @smemreuse //! //! Snippet //! +++++++ //! //! The code snippet below illustrates an inclusive prefix sum of 128 integer items that //! are partitioned across 128 threads. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! __global__ void ExampleKernel(...) //! { //! // Specialize BlockScan for a 1D block of 128 threads of type int //! using BlockScan = cub::BlockScan; //! //! // Allocate shared memory for BlockScan //! __shared__ typename BlockScan::TempStorage temp_storage; //! //! // Obtain input item for each thread //! int thread_data; //! ... //! //! // Collectively compute the block-wide inclusive prefix sum //! int block_aggregate; //! BlockScan(temp_storage).InclusiveSum(thread_data, thread_data, block_aggregate); //! //! Suppose the set of input ``thread_data`` across the block of threads is ``1, 1, ..., 1``. //! The corresponding output ``thread_data`` in those threads will be ``1, 2, ..., 128``. //! Furthermore the value ``128`` will be stored in ``block_aggregate`` for all threads. //! //! @endrst //! //! @param[in] input //! Calling thread's input item //! //! @param[out] output //! Calling thread's output item (may be aliased to `input`) //! //! @param[out] block_aggregate //! block-wide aggregate reduction of input items _CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveSum(T input, T& output, T& block_aggregate) { InclusiveScan(input, output, cub::Sum(), block_aggregate); } //! @rst //! Computes an inclusive block-wide prefix scan using addition (+) as the scan operator. //! Each thread contributes one input element. Instead of using 0 as the block-wide prefix, the call-back functor //! ``block_prefix_callback_op`` is invoked by the first warp in the block, and the value returned by //! *lane*\ :sub:`0` in that warp is used as the "seed" value that logically prefixes the thread block's //! scan inputs. Also provides every thread with the block-wide ``block_aggregate`` of all inputs. //! //! - The ``block_prefix_callback_op`` functor must implement a member function //! ``T operator()(T block_aggregate)``. The functor's input parameter //! ``block_aggregate`` is the same value also returned by the scan operation. //! The functor will be invoked by the first warp of threads in the block, //! however only the return value from *lane*\ :sub:`0` is applied //! as the block-wide prefix. Can be stateful. //! - @rowmajor //! - @smemreuse //! //! Snippet //! +++++++ //! //! The code snippet below illustrates a single thread block that progressively //! computes an inclusive prefix sum over multiple "tiles" of input using a //! prefix functor to maintain a running total between block-wide scans. //! Each tile consists of 128 integer items that are partitioned across 128 threads. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! // A stateful callback functor that maintains a running prefix to be applied //! // during consecutive scan operations. //! struct BlockPrefixCallbackOp //! { //! // Running prefix //! int running_total; //! //! // Constructor //! __host__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {} //! //! // Callback operator to be entered by the first warp of threads in the block. //! // Thread-0 is responsible for returning a value for seeding the block-wide scan. //! __host__ int operator()(int block_aggregate) //! { //! int old_prefix = running_total; //! running_total += block_aggregate; //! return old_prefix; //! } //! }; //! //! __global__ void ExampleKernel(int *d_data, int num_items, ...) //! { //! // Specialize BlockScan for a 1D block of 128 threads //! using BlockScan = cub::BlockScan; //! //! // Allocate shared memory for BlockScan //! __shared__ typename BlockScan::TempStorage temp_storage; //! //! // Initialize running total //! BlockPrefixCallbackOp prefix_op(0); //! //! // Have the block iterate over segments of items //! for (int block_offset = 0; block_offset < num_items; block_offset += 128) //! { //! // Load a segment of consecutive items that are blocked across threads //! int thread_data = d_data[block_offset]; //! //! // Collectively compute the block-wide inclusive prefix sum //! BlockScan(temp_storage).InclusiveSum( //! thread_data, thread_data, prefix_op); //! CTA_SYNC(); //! //! // Store scanned items to output segment //! d_data[block_offset] = thread_data; //! } //! //! Suppose the input ``d_data`` is ``1, 1, 1, 1, 1, 1, 1, 1, ...``. //! The corresponding output for the first segment will be ``1, 2, ..., 128``. //! The output for the second segment will be ``129, 130, ..., 256``. //! //! @endrst //! //! @tparam BlockPrefixCallbackOp //! **[inferred]** Call-back functor type having member `T operator()(T block_aggregate)` //! //! @param[in] input //! Calling thread's input item //! //! @param[out] output //! Calling thread's output item (may be aliased to `input`) //! //! @param[in,out] block_prefix_callback_op //! @rst //! *warp*\ :sub:`0` only call-back functor for specifying a block-wide prefix to be applied //! to the logical input sequence. //! @endrst template _CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveSum(T input, T& output, BlockPrefixCallbackOp& block_prefix_callback_op) { InclusiveScan(input, output, cub::Sum(), block_prefix_callback_op); } //! @} end member group //! @name Inclusive prefix sum operations (multiple data per thread) //! @{ //! @rst //! Computes an inclusive block-wide prefix scan using addition (+) as the scan operator. //! Each thread contributes an array of consecutive input elements. //! //! - @blocked //! - @granularity //! - @smemreuse //! //! Snippet //! +++++++ //! //! The code snippet below illustrates an inclusive prefix sum of 512 integer items that //! are partitioned in a :ref:`blocked arrangement ` across 128 threads //! where each thread owns 4 consecutive items. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! __global__ void ExampleKernel(...) //! { //! // Specialize BlockScan for a 1D block of 128 threads of type int //! using BlockScan = cub::BlockScan; //! //! // Allocate shared memory for BlockScan //! __shared__ typename BlockScan::TempStorage temp_storage; //! //! // Obtain a segment of consecutive items that are blocked across threads //! int thread_data[4]; //! ... //! //! // Collectively compute the block-wide inclusive prefix sum //! BlockScan(temp_storage).InclusiveSum(thread_data, thread_data); //! //! Suppose the set of input ``thread_data`` across the block of threads is //! ``{ [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }``. The corresponding output //! ``thread_data`` in those threads will be ``{ [1,2,3,4], [5,6,7,8], ..., [509,510,511,512] }``. //! //! @endrst //! //! @tparam ITEMS_PER_THREAD //! **[inferred]** The number of consecutive items partitioned onto each thread. //! //! @param[in] input //! Calling thread's input items //! //! @param[out] output //! Calling thread's output items (may be aliased to `input`) template _CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveSum(T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD]) { if (ITEMS_PER_THREAD == 1) { InclusiveSum(input[0], output[0]); } else { // Reduce consecutive thread items in registers Sum scan_op; T thread_prefix = internal::ThreadReduce(input, scan_op); // Exclusive thread block-scan ExclusiveSum(thread_prefix, thread_prefix); // Inclusive scan in registers with prefix as seed internal::ThreadScanInclusive(input, output, scan_op, thread_prefix, (linear_tid != 0)); } } //! @rst //! Computes an inclusive block-wide prefix scan using addition (+) as the scan operator. //! Each thread contributes an array of consecutive input elements. //! Also provides every thread with the block-wide ``block_aggregate`` of all inputs. //! //! - @blocked //! - @granularity //! - @smemreuse //! //! Snippet //! +++++++ //! //! The code snippet below illustrates an inclusive prefix sum of 512 integer items that //! are partitioned in a :ref:`blocked arrangement ` across 128 threads //! where each thread owns 4 consecutive items. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! __global__ void ExampleKernel(...) //! { //! // Specialize BlockScan for a 1D block of 128 threads of type int //! using BlockScan = cub::BlockScan; //! //! // Allocate shared memory for BlockScan //! __shared__ typename BlockScan::TempStorage temp_storage; //! //! // Obtain a segment of consecutive items that are blocked across threads //! int thread_data[4]; //! ... //! //! // Collectively compute the block-wide inclusive prefix sum //! int block_aggregate; //! BlockScan(temp_storage).InclusiveSum(thread_data, thread_data, block_aggregate); //! //! Suppose the set of input ``thread_data`` across the block of threads is //! ``{ [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }``. The //! corresponding output ``thread_data`` in those threads will be //! ``{ [1,2,3,4], [5,6,7,8], ..., [509,510,511,512] }``. //! Furthermore the value ``512`` will be stored in ``block_aggregate`` for all threads. //! //! @endrst //! //! @tparam ITEMS_PER_THREAD //! **[inferred]** The number of consecutive items partitioned onto each thread. //! //! @tparam ScanOp //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)` //! //! @param[in] input //! Calling thread's input items //! //! @param[out] output //! Calling thread's output items (may be aliased to `input`) //! //! @param[out] block_aggregate //! block-wide aggregate reduction of input items template _CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveSum(T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], T& block_aggregate) { if (ITEMS_PER_THREAD == 1) { InclusiveSum(input[0], output[0], block_aggregate); } else { // Reduce consecutive thread items in registers Sum scan_op; T thread_prefix = internal::ThreadReduce(input, scan_op); // Exclusive thread block-scan ExclusiveSum(thread_prefix, thread_prefix, block_aggregate); // Inclusive scan in registers with prefix as seed internal::ThreadScanInclusive(input, output, scan_op, thread_prefix, (linear_tid != 0)); } } //! @rst //! Computes an inclusive block-wide prefix scan using addition (+) as the scan operator. //! Each thread contributes an array of consecutive input elements. //! Instead of using 0 as the block-wide prefix, the call-back functor ``block_prefix_callback_op`` is invoked by //! the first warp in the block, and the value returned by *lane*\ :sub:`0` in that warp is used as the "seed" //! value that logically prefixes the thread block's scan inputs. Also provides every thread with the //! block-wide ``block_aggregate`` of all inputs. //! //! - The ``block_prefix_callback_op`` functor must implement a member function //! ``T operator()(T block_aggregate)``. The functor's input parameter //! ``block_aggregate`` is the same value also returned by the scan operation. //! The functor will be invoked by the first warp of threads in the block, //! however only the return value from *lane*\ :sub:`0` is applied //! as the block-wide prefix. Can be stateful. //! - @blocked //! - @granularity //! - @smemreuse //! //! Snippet //! +++++++ //! //! The code snippet below illustrates a single thread block that progressively //! computes an inclusive prefix sum over multiple "tiles" of input using a //! prefix functor to maintain a running total between block-wide scans. Each tile consists //! of 512 integer items that are partitioned in a :ref:`blocked arrangement ` //! across 128 threads where each thread owns 4 consecutive items. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! // A stateful callback functor that maintains a running prefix to be applied //! // during consecutive scan operations. //! struct BlockPrefixCallbackOp //! { //! // Running prefix //! int running_total; //! //! // Constructor //! __host__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {} //! //! // Callback operator to be entered by the first warp of threads in the block. //! // Thread-0 is responsible for returning a value for seeding the block-wide scan. //! __host__ int operator()(int block_aggregate) //! { //! int old_prefix = running_total; //! running_total += block_aggregate; //! return old_prefix; //! } //! }; //! //! __global__ void ExampleKernel(int *d_data, int num_items, ...) //! { //! // Specialize BlockLoad, BlockStore, and BlockScan for a 1D block of 128 threads, 4 ints per thread //! using BlockLoad = cub::BlockLoad ; //! using BlockStore = cub::BlockStore ; //! using BlockScan = cub::BlockScan ; //! //! // Allocate aliased shared memory for BlockLoad, BlockStore, and BlockScan //! __shared__ union { //! typename BlockLoad::TempStorage load; //! typename BlockScan::TempStorage scan; //! typename BlockStore::TempStorage store; //! } temp_storage; //! //! // Initialize running total //! BlockPrefixCallbackOp prefix_op(0); //! //! // Have the block iterate over segments of items //! for (int block_offset = 0; block_offset < num_items; block_offset += 128 * 4) //! { //! // Load a segment of consecutive items that are blocked across threads //! int thread_data[4]; //! BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data); //! CTA_SYNC(); //! //! // Collectively compute the block-wide inclusive prefix sum //! BlockScan(temp_storage.scan).IncluisveSum( //! thread_data, thread_data, prefix_op); //! CTA_SYNC(); //! //! // Store scanned items to output segment //! BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data); //! CTA_SYNC(); //! } //! //! Suppose the input ``d_data`` is ``1, 1, 1, 1, 1, 1, 1, 1, ...``. //! The corresponding output for the first segment will be //! ``1, 2, 3, 4, ..., 511, 512``. The output for the second segment will be //! ``513, 514, 515, 516, ..., 1023, 1024``. //! //! @endrst //! //! @tparam ITEMS_PER_THREAD //! **[inferred]** The number of consecutive items partitioned onto each thread. //! //! @tparam BlockPrefixCallbackOp //! **[inferred]** Call-back functor type having member `T operator()(T block_aggregate)` //! //! @param[in] input //! Calling thread's input items //! //! @param[out] output //! Calling thread's output items (may be aliased to `input`) //! //! @param[in,out] block_prefix_callback_op //! @rst //! *warp*\ :sub:`0` only call-back functor for specifying a block-wide prefix to be applied to the //! logical input sequence. //! @endrst template _CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveSum( T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], BlockPrefixCallbackOp& block_prefix_callback_op) { if (ITEMS_PER_THREAD == 1) { InclusiveSum(input[0], output[0], block_prefix_callback_op); } else { // Reduce consecutive thread items in registers Sum scan_op; T thread_prefix = internal::ThreadReduce(input, scan_op); // Exclusive thread block-scan ExclusiveSum(thread_prefix, thread_prefix, block_prefix_callback_op); // Inclusive scan in registers with prefix as seed internal::ThreadScanInclusive(input, output, scan_op, thread_prefix); } } //! @} end member group //! @name Inclusive prefix scan operations //! @{ //! @rst //! Computes an inclusive block-wide prefix scan using the specified binary ``scan_op`` functor. //! Each thread contributes one input element. //! //! - Supports non-commutative scan operators. //! - @rowmajor //! - @smemreuse //! //! Snippet //! +++++++ //! //! The code snippet below illustrates an inclusive prefix max scan of 128 integer items that //! are partitioned across 128 threads. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! __global__ void ExampleKernel(...) //! { //! // Specialize BlockScan for a 1D block of 128 threads of type int //! using BlockScan = cub::BlockScan; //! //! // Allocate shared memory for BlockScan //! __shared__ typename BlockScan::TempStorage temp_storage; //! //! // Obtain input item for each thread //! int thread_data; //! ... //! //! // Collectively compute the block-wide inclusive prefix max scan //! BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cub::Max()); //! //! Suppose the set of input ``thread_data`` across the block of threads is //! ``0, -1, 2, -3, ..., 126, -127``. The corresponding output ``thread_data`` //! in those threads will be ``0, 0, 2, 2, ..., 126, 126``. //! //! @endrst //! //! @tparam ScanOp //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)` //! //! @param[in] input //! Calling thread's input item //! //! @param[out] output //! Calling thread's output item (may be aliased to `input`) //! //! @param[in] scan_op //! Binary scan functor template _CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveScan(T input, T& output, ScanOp scan_op) { InternalBlockScan(temp_storage).InclusiveScan(input, output, scan_op); } //! @rst //! Computes an inclusive block-wide prefix scan using the specified binary ``scan_op`` functor. //! Each thread contributes one input element. Also provides every thread with the block-wide //! ``block_aggregate`` of all inputs. //! //! - Supports non-commutative scan operators. //! - @rowmajor //! - @smemreuse //! //! Snippet //! +++++++ //! //! The code snippet below illustrates an inclusive prefix max scan of 128 //! integer items that are partitioned across 128 threads. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! __global__ void ExampleKernel(...) //! { //! // Specialize BlockScan for a 1D block of 128 threads of type int //! using BlockScan = cub::BlockScan; //! //! // Allocate shared memory for BlockScan //! __shared__ typename BlockScan::TempStorage temp_storage; //! //! // Obtain input item for each thread //! int thread_data; //! ... //! //! // Collectively compute the block-wide inclusive prefix max scan //! int block_aggregate; //! BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cub::Max(), block_aggregate); //! //! Suppose the set of input ``thread_data`` across the block of threads is //! ``0, -1, 2, -3, ..., 126, -127``. The corresponding output ``thread_data`` //! in those threads will be ``0, 0, 2, 2, ..., 126, 126``. Furthermore the value //! ``126`` will be stored in ``block_aggregate`` for all threads. //! //! @endrst //! //! @tparam ScanOp //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)` //! //! @param[in] input //! Calling thread's input item //! //! @param[out] output //! Calling thread's output item (may be aliased to `input`) //! //! @param[in] scan_op //! Binary scan functor //! //! @param[out] block_aggregate //! Block-wide aggregate reduction of input items template _CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveScan(T input, T& output, ScanOp scan_op, T& block_aggregate) { InternalBlockScan(temp_storage).InclusiveScan(input, output, scan_op, block_aggregate); } //! @rst //! Computes an inclusive block-wide prefix scan using the specified binary ``scan_op`` functor. //! Each thread contributes one input element. The call-back functor ``block_prefix_callback_op`` //! is invoked by the first warp in the block, and the value returned by *lane*\ :sub:`0` in that warp is used as //! the "seed" value that logically prefixes the thread block's scan inputs. //! Also provides every thread with the block-wide ``block_aggregate`` of all inputs. //! //! - The ``block_prefix_callback_op`` functor must implement a member function //! ``T operator()(T block_aggregate)``. The functor's input parameter //! ``block_aggregate`` is the same value also returned by the scan operation. //! The functor will be invoked by the first warp of threads in the block, //! however only the return value from *lane*\ :sub:`0` is applied //! as the block-wide prefix. Can be stateful. //! - Supports non-commutative scan operators. //! - @rowmajor //! - @smemreuse //! //! Snippet //! +++++++ //! //! The code snippet below illustrates a single thread block that progressively //! computes an inclusive prefix max scan over multiple "tiles" of input using a //! prefix functor to maintain a running total between block-wide scans. Each tile consists //! of 128 integer items that are partitioned across 128 threads. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! // A stateful callback functor that maintains a running prefix to be applied //! // during consecutive scan operations. //! struct BlockPrefixCallbackOp //! { //! // Running prefix //! int running_total; //! //! // Constructor //! __host__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {} //! //! // Callback operator to be entered by the first warp of threads in the block. //! // Thread-0 is responsible for returning a value for seeding the block-wide scan. //! __host__ int operator()(int block_aggregate) //! { //! int old_prefix = running_total; //! running_total = (block_aggregate > old_prefix) ? block_aggregate : old_prefix; //! return old_prefix; //! } //! }; //! //! __global__ void ExampleKernel(int *d_data, int num_items, ...) //! { //! // Specialize BlockScan for a 1D block of 128 threads //! using BlockScan = cub::BlockScan; //! //! // Allocate shared memory for BlockScan //! __shared__ typename BlockScan::TempStorage temp_storage; //! //! // Initialize running total //! BlockPrefixCallbackOp prefix_op(INT_MIN); //! //! // Have the block iterate over segments of items //! for (int block_offset = 0; block_offset < num_items; block_offset += 128) //! { //! // Load a segment of consecutive items that are blocked across threads //! int thread_data = d_data[block_offset]; //! //! // Collectively compute the block-wide inclusive prefix max scan //! BlockScan(temp_storage).InclusiveScan( //! thread_data, thread_data, cub::Max(), prefix_op); //! CTA_SYNC(); //! //! // Store scanned items to output segment //! d_data[block_offset] = thread_data; //! } //! //! Suppose the input ``d_data`` is ``0, -1, 2, -3, 4, -5, ...``. //! The corresponding output for the first segment will be //! ``0, 0, 2, 2, ..., 126, 126``. The output for the second segment //! will be ``128, 128, 130, 130, ..., 254, 254``. //! //! @endrst //! //! @tparam ScanOp //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)` //! //! @tparam BlockPrefixCallbackOp //! **[inferred]** Call-back functor type having member `T operator()(T block_aggregate)` //! //! @param[in] input //! Calling thread's input item //! //! @param[out] output //! Calling thread's output item (may be aliased to `input`) //! //! @param[in] scan_op //! Binary scan functor //! //! @param[in,out] block_prefix_callback_op //! @rst //! *warp*\ :sub:`0` only call-back functor for specifying a block-wide prefix to be applied to //! the logical input sequence. //! @endrst template _CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveScan(T input, T& output, ScanOp scan_op, BlockPrefixCallbackOp& block_prefix_callback_op) { InternalBlockScan(temp_storage).InclusiveScan(input, output, scan_op, block_prefix_callback_op); } //! @} end member group //! @name Inclusive prefix scan operations (multiple data per thread) //! @{ //! @rst //! Computes an inclusive block-wide prefix scan using the specified binary ``scan_op`` functor. //! Each thread contributes an array of consecutive input elements. //! //! - Supports non-commutative scan operators. //! - @blocked //! - @granularity //! - @smemreuse //! //! Snippet //! +++++++ //! //! The code snippet below illustrates an inclusive prefix max scan of 512 integer items that //! are partitioned in a [blocked arrangement](../index.html#sec5sec3) across 128 threads //! where each thread owns 4 consecutive items. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! __global__ void ExampleKernel(...) //! { //! // Specialize BlockScan for a 1D block of 128 threads of type int //! using BlockScan = cub::BlockScan; //! //! // Allocate shared memory for BlockScan //! __shared__ typename BlockScan::TempStorage temp_storage; //! //! // Obtain a segment of consecutive items that are blocked across threads //! int thread_data[4]; //! ... //! //! // Collectively compute the block-wide inclusive prefix max scan //! BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cub::Max()); //! //! Suppose the set of input ``thread_data`` across the block of threads is //! ``{ [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }``. //! The corresponding output ``thread_data`` in those threads will be //! ``{ [0,0,2,2], [4,4,6,6], ..., [508,508,510,510] }``. //! //! @endrst //! //! @tparam ITEMS_PER_THREAD //! **[inferred]** The number of consecutive items partitioned onto each thread. //! //! @tparam ScanOp //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)` //! //! @param[in] input //! Calling thread's input items //! //! @param[out] output //! Calling thread's output items (may be aliased to `input`) //! //! @param[in] scan_op //! Binary scan functor template _CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveScan(T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], ScanOp scan_op) { if (ITEMS_PER_THREAD == 1) { InclusiveScan(input[0], output[0], scan_op); } else { // Reduce consecutive thread items in registers T thread_prefix = internal::ThreadReduce(input, scan_op); // Exclusive thread block-scan ExclusiveScan(thread_prefix, thread_prefix, scan_op); // Inclusive scan in registers with prefix as seed (first thread does not seed) internal::ThreadScanInclusive(input, output, scan_op, thread_prefix, (linear_tid != 0)); } } //! @rst //! Computes an inclusive block-wide prefix scan using the specified binary ``scan_op`` functor. //! Each thread contributes an array of consecutive input elements. //! //! - Supports non-commutative scan operators. //! - @blocked //! - @granularity //! - @smemreuse //! //! Snippet //! +++++++ //! //! The code snippet below illustrates an inclusive prefix max scan of 128 integer items that //! are partitioned in a :ref:`blocked arrangement ` across 64 threads //! where each thread owns 2 consecutive items. //! //! .. literalinclude:: ../../../cub/test/catch2_test_block_scan_api.cu //! :language: c++ //! :dedent: //! :start-after: example-begin inclusive-scan-array-init-value //! :end-before: example-end inclusive-scan-array-init-value //! //! //! @endrst //! //! @tparam ITEMS_PER_THREAD //! **[inferred]** The number of consecutive items partitioned onto each thread. //! //! @tparam ScanOp //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)` //! //! @param[in] input //! Calling thread's input items //! //! @param[out] output //! Calling thread's output items (may be aliased to `input`) //! //! @param[in] initial_value //! Initial value to seed the inclusive scan (uniform across block) //! //! @param[in] scan_op //! Binary scan functor template _CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveScan(T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], T initial_value, ScanOp scan_op) { // Reduce consecutive thread items in registers T thread_prefix = internal::ThreadReduce(input, scan_op); // Exclusive thread block-scan ExclusiveScan(thread_prefix, thread_prefix, initial_value, scan_op); // Exclusive scan in registers with prefix as seed internal::ThreadScanInclusive(input, output, scan_op, thread_prefix); } //! @rst //! Computes an inclusive block-wide prefix scan using the specified binary ``scan_op`` functor. //! Each thread contributes an array of consecutive input elements. Also provides every thread //! with the block-wide ``block_aggregate`` of all inputs. //! //! - Supports non-commutative scan operators. //! - @blocked //! - @granularity //! - @smemreuse //! //! Snippet //! +++++++ //! //! The code snippet below illustrates an inclusive prefix max scan of 512 integer items that //! are partitioned in a [blocked arrangement](../index.html#sec5sec3) across 128 threads //! where each thread owns 4 consecutive items. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! __global__ void ExampleKernel(...) //! { //! // Specialize BlockScan for a 1D block of 128 threads of type int //! using BlockScan = cub::BlockScan; //! //! // Allocate shared memory for BlockScan //! __shared__ typename BlockScan::TempStorage temp_storage; //! //! // Obtain a segment of consecutive items that are blocked across threads //! int thread_data[4]; //! ... //! //! // Collectively compute the block-wide inclusive prefix max scan //! int block_aggregate; //! BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cub::Max(), block_aggregate); //! //! Suppose the set of input ``thread_data`` across the block of threads is //! ``{ [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }``. //! The corresponding output ``thread_data`` in those threads will be //! ``{ [0,0,2,2], [4,4,6,6], ..., [508,508,510,510] }``. //! Furthermore the value ``510`` will be stored in ``block_aggregate`` for all threads. //! //! @endrst //! //! @tparam ITEMS_PER_THREAD //! **[inferred]** The number of consecutive items partitioned onto each thread. //! //! @tparam ScanOp //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)` //! //! @param[in] input //! Calling thread's input items //! //! @param[out] output //! Calling thread's output items (may be aliased to `input`) //! //! @param[in] scan_op //! Binary scan functor //! //! @param[out] block_aggregate //! Block-wide aggregate reduction of input items template _CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveScan(T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], ScanOp scan_op, T& block_aggregate) { if (ITEMS_PER_THREAD == 1) { InclusiveScan(input[0], output[0], scan_op, block_aggregate); } else { // Reduce consecutive thread items in registers T thread_prefix = internal::ThreadReduce(input, scan_op); // Exclusive thread block-scan (with no initial value) ExclusiveScan(thread_prefix, thread_prefix, scan_op, block_aggregate); // Inclusive scan in registers with prefix as seed (first thread does not seed) internal::ThreadScanInclusive(input, output, scan_op, thread_prefix, (linear_tid != 0)); } } //! @rst //! Computes an inclusive block-wide prefix scan using the specified binary ``scan_op`` functor. //! Each thread contributes an array of consecutive input elements. Also provides every thread //! with the block-wide ``block_aggregate`` of all inputs. //! //! - Supports non-commutative scan operators. //! - @blocked //! - @granularity //! - @smemreuse //! //! Snippet //! +++++++ //! //! The code snippet below illustrates an inclusive prefix max scan of 128 integer items that //! are partitioned in a :ref:`blocked arrangement ` across 64 threads //! where each thread owns 2 consecutive items. //! //! .. literalinclude:: ../../../cub/test/catch2_test_block_scan_api.cu //! :language: c++ //! :dedent: //! :start-after: example-begin inclusive-scan-array-aggregate-init-value //! :end-before: example-end inclusive-scan-array-aggregate-init-value //! //! The value ``126`` will be stored in ``block_aggregate`` for all threads. //! //! @endrst //! //! @tparam ITEMS_PER_THREAD //! **[inferred]** The number of consecutive items partitioned onto each thread. //! //! @tparam ScanOp //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)` //! //! @param[in] input //! Calling thread's input items //! //! @param[out] output //! Calling thread's output items (may be aliased to `input`) //! //! @param[in] initial_value //! Initial value to seed the inclusive scan (uniform across block). It is not taken //! into account for block_aggregate. //! //! @param[in] scan_op //! Binary scan functor //! //! @param[out] block_aggregate //! Block-wide aggregate reduction of input items template _CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveScan( T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], T initial_value, ScanOp scan_op, T& block_aggregate) { // Reduce consecutive thread items in registers T thread_prefix = internal::ThreadReduce(input, scan_op); // Exclusive thread block-scan ExclusiveScan(thread_prefix, thread_prefix, initial_value, scan_op, block_aggregate); // Exclusive scan in registers with prefix as seed internal::ThreadScanInclusive(input, output, scan_op, thread_prefix); } //! @rst //! Computes an inclusive block-wide prefix scan using the specified binary ``scan_op`` functor. //! Each thread contributes an array of consecutive input elements. //! The call-back functor ``block_prefix_callback_op`` is invoked by the first warp in the block, //! and the value returned by *lane*\ :sub:`0` in that warp is used as the "seed" value that logically prefixes the //! thread block's scan inputs. Also provides every thread with the block-wide ``block_aggregate`` of all inputs. //! //! - The ``block_prefix_callback_op`` functor must implement a member function ``T operator()(T block_aggregate)``. //! The functor's input parameter ``block_aggregate`` is the same value also returned by the scan operation. //! The functor will be invoked by the first warp of threads in the block, however only the return value //! from *lane*\ :sub:`0` is applied as the block-wide prefix. Can be stateful. //! - Supports non-commutative scan operators. //! - @blocked //! - @granularity //! - @smemreuse //! //! Snippet //! +++++++ //! //! The code snippet below illustrates a single thread block that progressively //! computes an inclusive prefix max scan over multiple "tiles" of input using a //! prefix functor to maintain a running total between block-wide scans. Each tile consists //! of 128 integer items that are partitioned across 128 threads. //! //! .. code-block:: c++ //! //! #include // or equivalently //! //! // A stateful callback functor that maintains a running prefix to be applied //! // during consecutive scan operations. //! struct BlockPrefixCallbackOp //! { //! // Running prefix //! int running_total; //! //! // Constructor //! __host__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {} //! //! // Callback operator to be entered by the first warp of threads in the block. //! // Thread-0 is responsible for returning a value for seeding the block-wide scan. //! __host__ int operator()(int block_aggregate) //! { //! int old_prefix = running_total; //! running_total = (block_aggregate > old_prefix) ? block_aggregate : old_prefix; //! return old_prefix; //! } //! }; //! //! __global__ void ExampleKernel(int *d_data, int num_items, ...) //! { //! // Specialize BlockLoad, BlockStore, and BlockScan for a 1D block of 128 threads, 4 ints per thread //! using BlockLoad = cub::BlockLoad ; //! using BlockStore = cub::BlockStore ; //! using BlockScan = cub::BlockScan ; //! //! // Allocate aliased shared memory for BlockLoad, BlockStore, and BlockScan //! __shared__ union { //! typename BlockLoad::TempStorage load; //! typename BlockScan::TempStorage scan; //! typename BlockStore::TempStorage store; //! } temp_storage; //! //! // Initialize running total //! BlockPrefixCallbackOp prefix_op(0); //! //! // Have the block iterate over segments of items //! for (int block_offset = 0; block_offset < num_items; block_offset += 128 * 4) //! { //! // Load a segment of consecutive items that are blocked across threads //! int thread_data[4]; //! BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data); //! CTA_SYNC(); //! //! // Collectively compute the block-wide inclusive prefix max scan //! BlockScan(temp_storage.scan).InclusiveScan( //! thread_data, thread_data, cub::Max(), prefix_op); //! CTA_SYNC(); //! //! // Store scanned items to output segment //! BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data); //! CTA_SYNC(); //! } //! //! Suppose the input ``d_data`` is ``0, -1, 2, -3, 4, -5, ...``. //! The corresponding output for the first segment will be //! ``0, 0, 2, 2, 4, 4, ..., 510, 510``. The output for the second //! segment will be ``512, 512, 514, 514, 516, 516, ..., 1022, 1022``. //! //! @endrst //! //! @tparam ITEMS_PER_THREAD //! **[inferred]** The number of consecutive items partitioned onto each thread. //! //! @tparam ScanOp //! **[inferred]** Binary scan functor type having member `T operator()(const T &a, const T &b)` //! //! @tparam BlockPrefixCallbackOp //! **[inferred]** Call-back functor type having member `T operator()(T block_aggregate)` //! //! @param[in] input //! Calling thread's input items //! //! @param[out] output //! Calling thread's output items (may be aliased to `input`) //! //! @param[in] scan_op //! Binary scan functor //! //! @param[in,out] block_prefix_callback_op //! @rst //! *warp*\ :sub:`0` only call-back functor for specifying a block-wide prefix to be applied to //! the logical input sequence. //! @endrst template _CCCL_DEVICE _CCCL_FORCEINLINE void InclusiveScan( T (&input)[ITEMS_PER_THREAD], T (&output)[ITEMS_PER_THREAD], ScanOp scan_op, BlockPrefixCallbackOp& block_prefix_callback_op) { if (ITEMS_PER_THREAD == 1) { InclusiveScan(input[0], output[0], scan_op, block_prefix_callback_op); } else { // Reduce consecutive thread items in registers T thread_prefix = internal::ThreadReduce(input, scan_op); // Exclusive thread block-scan ExclusiveScan(thread_prefix, thread_prefix, scan_op, block_prefix_callback_op); // Inclusive scan in registers with prefix as seed internal::ThreadScanInclusive(input, output, scan_op, thread_prefix); } } //! @} end member group }; CUB_NAMESPACE_END