/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/**
* @file
* The cub::BlockDiscontinuity class provides [collective](../index.html#sec0) methods for
* flagging discontinuities within an ordered set of items partitioned across a CUDA thread block.
*/
#pragma once
#include
#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
# pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
# pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
# pragma system_header
#endif // no system header
#include
#include
CUB_NAMESPACE_BEGIN
//! @rst
//! The BlockDiscontinuity class provides :ref:`collective ` methods for
//! flagging discontinuities within an ordered set of items partitioned across a CUDA thread
//! block.
//!
//! Overview
//! +++++++++++++++++++++++++++++++++++++++++++++
//!
//! - A set of "head flags" (or "tail flags") is often used to indicate corresponding items
//! that differ from their predecessors (or successors). For example, head flags are convenient
//! for demarcating disjoint data segments as part of a segmented scan or reduction.
//! - @blocked
//!
//! Performance Considerations
//! +++++++++++++++++++++++++++++++++++++++++++++
//!
//! - @granularity
//! - Incurs zero bank conflicts for most types
//!
//! A Simple Example
//! +++++++++++++++++++++++++++++++++++++++++++++
//!
//! @blockcollective{BlockDiscontinuity}
//!
//! The code snippet below illustrates the head flagging of 512 integer items that
//! are partitioned in a :ref:`blocked arrangement ` across 128 threads
//! where each thread owns 4 consecutive items.
//!
//! .. code-block:: c++
//!
//! #include // or equivalently
//!
//! __global__ void ExampleKernel(...)
//! {
//! // Specialize BlockDiscontinuity for a 1D block of 128 threads of type int
//! using BlockDiscontinuity = cub::BlockDiscontinuity;
//!
//! // Allocate shared memory for BlockDiscontinuity
//! __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
//!
//! // Obtain a segment of consecutive items that are blocked across threads
//! int thread_data[4];
//! ...
//!
//! // Collectively compute head flags for discontinuities in the segment
//! int head_flags[4];
//! BlockDiscontinuity(temp_storage).FlagHeads(head_flags, thread_data, cub::Inequality());
//!
//! Suppose the set of input ``thread_data`` across the block of threads is
//! ``{ [0,0,1,1], [1,1,1,1], [2,3,3,3], [3,4,4,4], ... }``.
//! The corresponding output ``head_flags`` in those threads will be
//! ``{ [1,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }``.
//!
//! Re-using dynamically allocating shared memory
//! +++++++++++++++++++++++++++++++++++++++++++++
//!
//! The ``examples/block/example_block_reduce_dyn_smem.cu`` example illustrates usage of
//! dynamically shared memory with BlockReduce and how to re-purpose the same memory region.
//! This example can be easily adapted to the storage required by BlockDiscontinuity.
//! @endrst
//!
//! @tparam T
//! The data type to be flagged.
//!
//! @tparam BLOCK_DIM_X
//! The thread block length in threads along the X dimension
//!
//! @tparam BLOCK_DIM_Y
//! **[optional]** The thread block length in threads along the Y dimension (default: 1)
//!
//! @tparam BLOCK_DIM_Z
//! **[optional]** The thread block length in threads along the Z dimension (default: 1)
//!
//! @tparam LEGACY_PTX_ARCH
//! **[optional]** Unused
template
class BlockDiscontinuity
{
private:
enum
{
/// The thread block size in threads
BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
};
/// Shared memory storage layout type (last element from each thread's input)
struct _TempStorage
{
T first_items[BLOCK_THREADS];
T last_items[BLOCK_THREADS];
};
/// Internal storage allocator
_CCCL_DEVICE _CCCL_FORCEINLINE _TempStorage& PrivateStorage()
{
__shared__ _TempStorage private_storage;
return private_storage;
}
/// Specialization for when FlagOp has third index param
template ::value>
struct ApplyOp
{
// Apply flag operator
static _CCCL_DEVICE _CCCL_FORCEINLINE bool FlagT(FlagOp flag_op, const T& a, const T& b, int idx)
{
return flag_op(a, b, idx);
}
};
/// Specialization for when FlagOp does not have a third index param
template
struct ApplyOp
{
// Apply flag operator
static _CCCL_DEVICE _CCCL_FORCEINLINE bool FlagT(FlagOp flag_op, const T& a, const T& b, int /*idx*/)
{
return flag_op(a, b);
}
};
/// Templated unrolling of item comparison (inductive case)
struct Iterate
{
/**
* @brief Head flags
*
* @param[out] flags
* Calling thread's discontinuity head_flags
*
* @param[in] input
* Calling thread's input items
*
* @param[out] preds
* Calling thread's predecessor items
*
* @param[in] flag_op
* Binary boolean flag predicate
*/
template
static _CCCL_DEVICE _CCCL_FORCEINLINE void FlagHeads(
int linear_tid,
FlagT (&flags)[ITEMS_PER_THREAD],
T (&input)[ITEMS_PER_THREAD],
T (&preds)[ITEMS_PER_THREAD],
FlagOp flag_op)
{
#pragma unroll
for (int i = 1; i < ITEMS_PER_THREAD; ++i)
{
preds[i] = input[i - 1];
flags[i] = ApplyOp::FlagT(flag_op, preds[i], input[i], (linear_tid * ITEMS_PER_THREAD) + i);
}
}
/**
* @brief Tail flags
*
* @param[out] flags
* Calling thread's discontinuity head_flags
*
* @param[in] input
* Calling thread's input items
*
* @param[in] flag_op
* Binary boolean flag predicate
*/
template
static _CCCL_DEVICE _CCCL_FORCEINLINE void
FlagTails(int linear_tid, FlagT (&flags)[ITEMS_PER_THREAD], T (&input)[ITEMS_PER_THREAD], FlagOp flag_op)
{
#pragma unroll
for (int i = 0; i < ITEMS_PER_THREAD - 1; ++i)
{
flags[i] = ApplyOp::FlagT(flag_op, input[i], input[i + 1], (linear_tid * ITEMS_PER_THREAD) + i + 1);
}
}
};
/******************************************************************************
* Thread fields
******************************************************************************/
/// Shared storage reference
_TempStorage& temp_storage;
/// Linear thread-id
unsigned int linear_tid;
public:
/// @smemstorage{BlockDiscontinuity}
struct TempStorage : Uninitialized<_TempStorage>
{};
//! @name Collective constructors
//! @{
/**
* @brief Collective constructor using a private static allocation of shared memory as temporary
* storage.
*/
_CCCL_DEVICE _CCCL_FORCEINLINE BlockDiscontinuity()
: temp_storage(PrivateStorage())
, linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
{}
/**
* @brief Collective constructor using the specified memory allocation as temporary storage.
*
* @param[in] temp_storage
* Reference to memory allocation having layout type TempStorage
*/
_CCCL_DEVICE _CCCL_FORCEINLINE BlockDiscontinuity(TempStorage& temp_storage)
: temp_storage(temp_storage.Alias())
, linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
{}
//! @} end member group
//! @name Head flag operations
//! @{
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
/**
* @param[out] head_flags
* Calling thread's discontinuity head_flags
*
* @param[in] input
* Calling thread's input items
*
* @param[out] preds
* Calling thread's predecessor items
*
* @param[in] flag_op
* Binary boolean flag predicate
*/
template
_CCCL_DEVICE _CCCL_FORCEINLINE void FlagHeads(
FlagT (&head_flags)[ITEMS_PER_THREAD], T (&input)[ITEMS_PER_THREAD], T (&preds)[ITEMS_PER_THREAD], FlagOp flag_op)
{
// Share last item
temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
CTA_SYNC();
if (linear_tid == 0)
{
// Set flag for first thread-item (preds[0] is undefined)
head_flags[0] = 1;
}
else
{
preds[0] = temp_storage.last_items[linear_tid - 1];
head_flags[0] = ApplyOp::FlagT(flag_op, preds[0], input[0], linear_tid * ITEMS_PER_THREAD);
}
// Set head_flags for remaining items
Iterate::FlagHeads(linear_tid, head_flags, input, preds, flag_op);
}
/**
* @param[out] head_flags
* Calling thread's discontinuity head_flags
*
* @param[in] input
* Calling thread's input items
*
* @param[out] preds
* Calling thread's predecessor items
*
* @param[in] flag_op
* Binary boolean flag predicate
*
* @param[in] tile_predecessor_item
* [thread0 only] Item with which to compare the first tile item
* (input0 from thread0).
*/
template
_CCCL_DEVICE _CCCL_FORCEINLINE void FlagHeads(
FlagT (&head_flags)[ITEMS_PER_THREAD],
T (&input)[ITEMS_PER_THREAD],
T (&preds)[ITEMS_PER_THREAD],
FlagOp flag_op,
T tile_predecessor_item)
{
// Share last item
temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
CTA_SYNC();
// Set flag for first thread-item
preds[0] = (linear_tid == 0) ? tile_predecessor_item : // First thread
temp_storage.last_items[linear_tid - 1];
head_flags[0] = ApplyOp::FlagT(flag_op, preds[0], input[0], linear_tid * ITEMS_PER_THREAD);
// Set head_flags for remaining items
Iterate::FlagHeads(linear_tid, head_flags, input, preds, flag_op);
}
#endif // DOXYGEN_SHOULD_SKIP_THIS
//! @rst
//! Sets head flags indicating discontinuities between items partitioned across the thread
//! block, for which the first item has no reference and is always flagged.
//!
//! - The flag ``head_flags[i]`` is set for item ``input[i]`` when ``flag_op(previous-item, input[i])`` returns
//! ``true`` (where ``previous-item`` is either the preceding item in the same thread or the last item in
//! the previous thread).
//! - For *thread*\ :sub:`0`, item ``input[0]`` is always flagged.
//! - @blocked
//! - @granularity
//! - @smemreuse
//!
//! Snippet
//! +++++++
//!
//! The code snippet below illustrates the head-flagging of 512 integer items that
//! are partitioned in a :ref:`blocked arrangement ` across 128 threads
//! where each thread owns 4 consecutive items.
//!
//! .. code-block:: c++
//!
//! #include // or equivalently
//!
//! __global__ void ExampleKernel(...)
//! {
//! // Specialize BlockDiscontinuity for a 1D block of 128 threads of type int
//! using BlockDiscontinuity = cub::BlockDiscontinuity;
//!
//! // Allocate shared memory for BlockDiscontinuity
//! __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
//!
//! // Obtain a segment of consecutive items that are blocked across threads
//! int thread_data[4];
//! ...
//!
//! // Collectively compute head flags for discontinuities in the segment
//! int head_flags[4];
//! BlockDiscontinuity(temp_storage).FlagHeads(head_flags, thread_data, cub::Inequality());
//!
//! Suppose the set of input ``thread_data`` across the block of threads is
//! ``{ [0,0,1,1], [1,1,1,1], [2,3,3,3], [3,4,4,4], ... }``.
//! The corresponding output ``head_flags`` in those threads will be
//! ``{ [1,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }``.
//! @endrst
//!
//! @tparam ITEMS_PER_THREAD
//! **[inferred]** The number of consecutive items partitioned onto each thread
//!
//! @tparam FlagT
//! **[inferred]** The flag type (must be an integer type)
//!
//! @tparam FlagOp
//! **[inferred]** Binary predicate functor type having member
//! `T operator()(const T &a, const T &b)` or member
//! `T operator()(const T &a, const T &b, unsigned int b_index)`, and returning `true`
//! if a discontinuity exists between `a` and `b`, otherwise `false`.
//! `b_index` is the rank of b in the aggregate tile of data.
//!
//! @param[out] head_flags
//! Calling thread's discontinuity head_flags
//!
//! @param[in] input
//! Calling thread's input items
//!
//! @param[in] flag_op
//! Binary boolean flag predicate
template
_CCCL_DEVICE _CCCL_FORCEINLINE void
FlagHeads(FlagT (&head_flags)[ITEMS_PER_THREAD], T (&input)[ITEMS_PER_THREAD], FlagOp flag_op)
{
T preds[ITEMS_PER_THREAD];
FlagHeads(head_flags, input, preds, flag_op);
}
//! @rst
//! Sets head flags indicating discontinuities between items partitioned across the thread block.
//!
//! - The flag ``head_flags[i]`` is set for item ``input[i]`` when ``flag_op(previous-item, input[i])``
//! returns ``true`` (where ``previous-item`` is either the preceding item in the same thread or the last item
//! in the previous thread).
//! - For *thread*\ :sub:`0`, item ``input[0]`` is compared against ``tile_predecessor_item``.
//! - @blocked
//! - @granularity
//! - @smemreuse
//!
//! Snippet
//! +++++++
//!
//! The code snippet below illustrates the head-flagging of 512 integer items that
//! are partitioned in a :ref:`blocked arrangement ` across 128 threads
//! where each thread owns 4 consecutive items.
//!
//! .. code-block:: c++
//!
//! #include // or equivalently
//!
//! __global__ void ExampleKernel(...)
//! {
//! // Specialize BlockDiscontinuity for a 1D block of 128 threads of type int
//! using BlockDiscontinuity = cub::BlockDiscontinuity;
//!
//! // Allocate shared memory for BlockDiscontinuity
//! __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
//!
//! // Obtain a segment of consecutive items that are blocked across threads
//! int thread_data[4];
//! ...
//!
//! // Have thread0 obtain the predecessor item for the entire tile
//! int tile_predecessor_item;
//! if (threadIdx.x == 0) tile_predecessor_item == ...
//!
//! // Collectively compute head flags for discontinuities in the segment
//! int head_flags[4];
//! BlockDiscontinuity(temp_storage).FlagHeads(
//! head_flags, thread_data, cub::Inequality(), tile_predecessor_item);
//!
//! Suppose the set of input ``thread_data`` across the block of threads is
//! ``{ [0,0,1,1], [1,1,1,1], [2,3,3,3], [3,4,4,4], ... }``,
//! and that ``tile_predecessor_item`` is ``0``. The corresponding output ``head_flags`` in those
//! threads will be ``{ [0,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }``.
//! @endrst
//!
//! @tparam ITEMS_PER_THREAD
//! **[inferred]** The number of consecutive items partitioned onto each thread.
//!
//! @tparam FlagT
//! **[inferred]** The flag type (must be an integer type)
//!
//! @tparam FlagOp
//! **[inferred]** Binary predicate functor type having member
//! `T operator()(const T &a, const T &b)` or member
//! `T operator()(const T &a, const T &b, unsigned int b_index)`,
//! and returning `true` if a discontinuity exists between `a` and `b`,
//! otherwise `false`. `b_index` is the rank of b in the aggregate tile of data.
//!
//! @param[out] head_flags
//! Calling thread's discontinuity `head_flags`
//!
//! @param[in] input
//! Calling thread's input items
//!
//! @param[in] flag_op
//! Binary boolean flag predicate
//!
//! @param[in] tile_predecessor_item
//! @rst
//! *thread*\ :sub:`0` only item with which to compare the first tile item (``input[0]`` from *thread*\ :sub:`0`).
//! @endrst
template
_CCCL_DEVICE _CCCL_FORCEINLINE void FlagHeads(
FlagT (&head_flags)[ITEMS_PER_THREAD], T (&input)[ITEMS_PER_THREAD], FlagOp flag_op, T tile_predecessor_item)
{
T preds[ITEMS_PER_THREAD];
FlagHeads(head_flags, input, preds, flag_op, tile_predecessor_item);
}
//! @} end member group
//! @name Tail flag operations
//! @{
//! @rst
//! Sets tail flags indicating discontinuities between items partitioned across the thread
//! block, for which the last item has no reference and is always flagged.
//!
//! - The flag ``tail_flags[i]`` is set for item ``input[i]`` when
//! ``flag_op(input[i], next-item)``
//! returns ``true`` (where `next-item` is either the next item
//! in the same thread or the first item in the next thread).
//! - For *thread*\ :sub:`BLOCK_THREADS - 1`, item ``input[ITEMS_PER_THREAD - 1]`` is always flagged.
//! - @blocked
//! - @granularity
//! - @smemreuse
//!
//! Snippet
//! +++++++
//!
//! The code snippet below illustrates the tail-flagging of 512 integer items that
//! are partitioned in a :ref:`blocked arrangement ` across 128 threads
//! where each thread owns 4 consecutive items.
//!
//! .. code-block:: c++
//!
//! #include // or equivalently
//!
//! __global__ void ExampleKernel(...)
//! {
//! // Specialize BlockDiscontinuity for a 1D block of 128 threads of type int
//! using BlockDiscontinuity = cub::BlockDiscontinuity;
//!
//! // Allocate shared memory for BlockDiscontinuity
//! __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
//!
//! // Obtain a segment of consecutive items that are blocked across threads
//! int thread_data[4];
//! ...
//!
//! // Collectively compute tail flags for discontinuities in the segment
//! int tail_flags[4];
//! BlockDiscontinuity(temp_storage).FlagTails(tail_flags, thread_data, cub::Inequality());
//!
//! Suppose the set of input ``thread_data`` across the block of threads is
//! ``{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }``.
//! The corresponding output ``tail_flags`` in those threads will be
//! ``{ [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,1] }``.
//! @endrst
//!
//! @tparam ITEMS_PER_THREAD
//! **[inferred]** The number of consecutive items partitioned onto each thread.
//!
//! @tparam FlagT
//! **[inferred]** The flag type (must be an integer type)
//!
//! @tparam FlagOp
//! **[inferred]** Binary predicate functor type having member
//! `T operator()(const T &a, const T &b)` or member
//! `T operator()(const T &a, const T &b, unsigned int b_index)`, and returning `true`
//! if a discontinuity exists between `a` and `b`, otherwise `false`. `b_index` is the
//! rank of `b` in the aggregate tile of data.
//!
//! @param[out] tail_flags
//! Calling thread's discontinuity tail_flags
//!
//! @param[in] input
//! Calling thread's input items
//!
//! @param[in] flag_op
//! Binary boolean flag predicate
template
_CCCL_DEVICE _CCCL_FORCEINLINE void
FlagTails(FlagT (&tail_flags)[ITEMS_PER_THREAD], T (&input)[ITEMS_PER_THREAD], FlagOp flag_op)
{
// Share first item
temp_storage.first_items[linear_tid] = input[0];
CTA_SYNC();
// Set flag for last thread-item
tail_flags[ITEMS_PER_THREAD - 1] =
(linear_tid == BLOCK_THREADS - 1) ? 1 : // Last thread
ApplyOp::FlagT(
flag_op,
input[ITEMS_PER_THREAD - 1],
temp_storage.first_items[linear_tid + 1],
(linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);
// Set tail_flags for remaining items
Iterate::FlagTails(linear_tid, tail_flags, input, flag_op);
}
//! @rst
//! Sets tail flags indicating discontinuities between items partitioned across the thread block.
//!
//! - The flag ``tail_flags[i]`` is set for item ``input[i]`` when ``flag_op(input[i], next-item)``
//! returns ``true`` (where ``next-item`` is either the next item in the same thread or the first item in
//! the next thread).
//! - For *thread*\ :sub:`BLOCK_THREADS - 1`, item ``input[ITEMS_PER_THREAD - 1]`` is compared against
//! ``tile_successor_item``.
//! - @blocked
//! - @granularity
//! - @smemreuse
//!
//! Snippet
//! +++++++
//!
//! The code snippet below illustrates the tail-flagging of 512 integer items that
//! are partitioned in a :ref:`blocked arrangement ` across 128 threads
//! where each thread owns 4 consecutive items.
//!
//! .. code-block:: c++
//!
//! #include // or equivalently
//!
//! __global__ void ExampleKernel(...)
//! {
//! // Specialize BlockDiscontinuity for a 1D block of 128 threads of type int
//! using BlockDiscontinuity = cub::BlockDiscontinuity;
//!
//! // Allocate shared memory for BlockDiscontinuity
//! __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
//!
//! // Obtain a segment of consecutive items that are blocked across threads
//! int thread_data[4];
//! ...
//!
//! // Have thread127 obtain the successor item for the entire tile
//! int tile_successor_item;
//! if (threadIdx.x == 127) tile_successor_item == ...
//!
//! // Collectively compute tail flags for discontinuities in the segment
//! int tail_flags[4];
//! BlockDiscontinuity(temp_storage).FlagTails(
//! tail_flags, thread_data, cub::Inequality(), tile_successor_item);
//!
//! Suppose the set of input ``thread_data`` across the block of threads is
//! ``{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }``
//! and that ``tile_successor_item`` is ``125``. The corresponding output ``tail_flags`` in those
//! threads will be ``{ [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,0] }``.
//! @endrst
//!
//! @tparam ITEMS_PER_THREAD
//! **[inferred]** The number of consecutive items partitioned onto each thread.
//!
//! @tparam FlagT
//! **[inferred]** The flag type (must be an integer type)
//!
//! @tparam FlagOp
//! **[inferred]** Binary predicate functor type having member
//! `T operator()(const T &a, const T &b)` or member
//! `T operator()(const T &a, const T &b, unsigned int b_index)`, and returning `true`
//! if a discontinuity exists between `a` and `b`, otherwise `false`. `b_index` is the
//! rank of `b` in the aggregate tile of data.
//!
//! @param[out] tail_flags
//! Calling thread's discontinuity tail_flags
//!
//! @param[in] input
//! Calling thread's input items
//!
//! @param[in] flag_op
//! Binary boolean flag predicate
//!
//! @param[in] tile_successor_item
//! @rst
//! *thread*\ :sub:`BLOCK_THREADS - 1` only item with which to
//! compare the last tile item (``input[ITEMS_PER_THREAD - 1]`` from
//! *thread*\ :sub:`BLOCK_THREADS - 1`).
//! @endrst
template
_CCCL_DEVICE _CCCL_FORCEINLINE void
FlagTails(FlagT (&tail_flags)[ITEMS_PER_THREAD], T (&input)[ITEMS_PER_THREAD], FlagOp flag_op, T tile_successor_item)
{
// Share first item
temp_storage.first_items[linear_tid] = input[0];
CTA_SYNC();
// Set flag for last thread-item
T successor_item = (linear_tid == BLOCK_THREADS - 1) ? tile_successor_item : // Last thread
temp_storage.first_items[linear_tid + 1];
tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp::FlagT(
flag_op, input[ITEMS_PER_THREAD - 1], successor_item, (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);
// Set tail_flags for remaining items
Iterate::FlagTails(linear_tid, tail_flags, input, flag_op);
}
//! @} end member group
//! @name Head & tail flag operations
//! @{
//! @rst
//! Sets both head and tail flags indicating discontinuities between items partitioned across the thread block.
//!
//! - The flag ``head_flags[i]`` is set for item ``input[i]`` when ``flag_op(previous-item, input[i])`` returns
//! ``true`` (where ``previous-item`` is either the preceding item in the same thread or the last item in
//! the previous thread).
//! - For *thread*\ :sub:`0`, item ``input[0]`` is always flagged.
//! - The flag ``tail_flags[i]`` is set for item ``input[i]`` when ``flag_op(input[i], next-item)``
//! returns ``true`` (where next-item is either the next item in the same thread or the first item in
//! the next thread).
//! - For *thread*\ :sub:`BLOCK_THREADS - 1`, item ``input[ITEMS_PER_THREAD - 1]`` is always flagged.
//! - @blocked
//! - @granularity
//! - @smemreuse
//!
//! Snippet
//! +++++++
//!
//! The code snippet below illustrates the head- and tail-flagging of 512 integer items that
//! are partitioned in a :ref:`blocked arrangement ` across 128 threads
//! where each thread owns 4 consecutive items.
//!
//! .. code-block:: c++
//!
//! #include // or equivalently
//!
//! __global__ void ExampleKernel(...)
//! {
//! // Specialize BlockDiscontinuity for a 1D block of 128 threads of type int
//! using BlockDiscontinuity = cub::BlockDiscontinuity;
//!
//! // Allocate shared memory for BlockDiscontinuity
//! __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
//!
//! // Obtain a segment of consecutive items that are blocked across threads
//! int thread_data[4];
//! ...
//!
//! // Collectively compute head and flags for discontinuities in the segment
//! int head_flags[4];
//! int tail_flags[4];
//! BlockDiscontinuity(temp_storage).FlagHeadsAndTails(
//! head_flags, tail_flags, thread_data, cub::Inequality());
//!
//! Suppose the set of input ``thread_data`` across the block of threads is
//! ``{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }``
//! and that the tile_successor_item is ``125``. The corresponding output ``head_flags``
//! in those threads will be ``{ [1,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }``.
//! and the corresponding output ``tail_flags`` in those threads will be
//! ``{ [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,1] }``.
//! @endrst
//!
//! @tparam ITEMS_PER_THREAD
//! **[inferred]** The number of consecutive items partitioned onto each thread.
//!
//! @tparam FlagT
//! **[inferred]** The flag type (must be an integer type)
//!
//! @tparam FlagOp
//! **[inferred]** Binary predicate functor type having member
//! `T operator()(const T &a, const T &b)` or member
//! `T operator()(const T &a, const T &b, unsigned int b_index)`, and returning `true`
//! if a discontinuity exists between `a` and `b`, otherwise `false`. `b_index` is the
//! rank of `b` in the aggregate tile of data.
//!
//! @param[out] head_flags
//! Calling thread's discontinuity head_flags
//!
//! @param[out] tail_flags
//! Calling thread's discontinuity tail_flags
//!
//! @param[in] input
//! Calling thread's input items
//!
//! @param[in] flag_op
//! Binary boolean flag predicate
template
_CCCL_DEVICE _CCCL_FORCEINLINE void FlagHeadsAndTails(
FlagT (&head_flags)[ITEMS_PER_THREAD],
FlagT (&tail_flags)[ITEMS_PER_THREAD],
T (&input)[ITEMS_PER_THREAD],
FlagOp flag_op)
{
// Share first and last items
temp_storage.first_items[linear_tid] = input[0];
temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
CTA_SYNC();
T preds[ITEMS_PER_THREAD];
// Set flag for first thread-item
if (linear_tid == 0)
{
head_flags[0] = 1;
}
else
{
preds[0] = temp_storage.last_items[linear_tid - 1];
head_flags[0] = ApplyOp::FlagT(flag_op, preds[0], input[0], linear_tid * ITEMS_PER_THREAD);
}
// Set flag for last thread-item
tail_flags[ITEMS_PER_THREAD - 1] =
(linear_tid == BLOCK_THREADS - 1) ? 1 : // Last thread
ApplyOp::FlagT(
flag_op,
input[ITEMS_PER_THREAD - 1],
temp_storage.first_items[linear_tid + 1],
(linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);
// Set head_flags for remaining items
Iterate::FlagHeads(linear_tid, head_flags, input, preds, flag_op);
// Set tail_flags for remaining items
Iterate::FlagTails(linear_tid, tail_flags, input, flag_op);
}
//! @rst
//! Sets both head and tail flags indicating discontinuities between items partitioned across the thread block.
//!
//! - The flag ``head_flags[i]`` is set for item ``input[i]`` when
//! ``flag_op(previous-item, input[i])`` returns ``true`` (where ``previous-item`` is either the preceding item
//! in the same thread or the last item in the previous thread).
//! - For *thread*\ :sub:`0`, item ``input[0]`` is always flagged.
//! - The flag ``tail_flags[i]`` is set for item ``input[i]`` when ``flag_op(input[i], next-item)`` returns ``true``
//! (where ``next-item`` is either the next item in the same thread or the first item in the next thread).
//! - For *thread*\ :sub:`BLOCK_THREADS - 1`, item ``input[ITEMS_PER_THREAD - 1]`` is compared
//! against ``tile_predecessor_item``.
//! - @blocked
//! - @granularity
//! - @smemreuse
//!
//! Snippet
//! +++++++
//!
//! The code snippet below illustrates the head- and tail-flagging of 512 integer items that
//! are partitioned in a :ref:`blocked arrangement ` across 128 threads
//! where each thread owns 4 consecutive items.
//!
//! .. code-block:: c++
//!
//! #include // or equivalently
//!
//! __global__ void ExampleKernel(...)
//! {
//! // Specialize BlockDiscontinuity for a 1D block of 128 threads of type int
//! using BlockDiscontinuity = cub::BlockDiscontinuity;
//!
//! // Allocate shared memory for BlockDiscontinuity
//! __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
//!
//! // Obtain a segment of consecutive items that are blocked across threads
//! int thread_data[4];
//! ...
//!
//! // Have thread127 obtain the successor item for the entire tile
//! int tile_successor_item;
//! if (threadIdx.x == 127) tile_successor_item == ...
//!
//! // Collectively compute head and flags for discontinuities in the segment
//! int head_flags[4];
//! int tail_flags[4];
//! BlockDiscontinuity(temp_storage).FlagHeadsAndTails(
//! head_flags, tail_flags, tile_successor_item, thread_data, cub::Inequality());
//!
//! Suppose the set of input ``thread_data`` across the block of threads is
//! ``{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }``
//! and that the tile_successor_item is ``125``. The corresponding output ``head_flags``
//! in those threads will be ``{ [1,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }``.
//! and the corresponding output ``tail_flags`` in those threads will be
//! ``{ [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,0] }``.
//! @endrst
//!
//! @tparam ITEMS_PER_THREAD
//! **[inferred]** The number of consecutive items partitioned onto each thread.
//!
//! @tparam FlagT
//! **[inferred]** The flag type (must be an integer type)
//!
//! @tparam FlagOp
//! **[inferred]** Binary predicate functor type having member
//! `T operator()(const T &a, const T &b)` or member
//! `T operator()(const T &a, const T &b, unsigned int b_index)`, and returning `true`
//! if a discontinuity exists between `a` and `b`, otherwise `false`. `b_index` is the
//! rank of b in the aggregate tile of data.
//!
//! @param[out] head_flags
//! Calling thread's discontinuity head_flags
//!
//! @param[out] tail_flags
//! Calling thread's discontinuity tail_flags
//!
//! @param[in] tile_successor_item
//! @rst
//! *thread*\ :sub:`BLOCK_THREADS - 1` only item with which to compare
//! the last tile item (``input[ITEMS_PER_THREAD - 1]`` from
//! *thread*\ :sub:`BLOCK_THREADS - 1`).
//! @endrst
//!
//! @param[in] input
//! Calling thread's input items
//!
//! @param[in] flag_op
//! Binary boolean flag predicate
template
_CCCL_DEVICE _CCCL_FORCEINLINE void FlagHeadsAndTails(
FlagT (&head_flags)[ITEMS_PER_THREAD],
FlagT (&tail_flags)[ITEMS_PER_THREAD],
T tile_successor_item,
T (&input)[ITEMS_PER_THREAD],
FlagOp flag_op)
{
// Share first and last items
temp_storage.first_items[linear_tid] = input[0];
temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
CTA_SYNC();
T preds[ITEMS_PER_THREAD];
// Set flag for first thread-item
if (linear_tid == 0)
{
head_flags[0] = 1;
}
else
{
preds[0] = temp_storage.last_items[linear_tid - 1];
head_flags[0] = ApplyOp::FlagT(flag_op, preds[0], input[0], linear_tid * ITEMS_PER_THREAD);
}
// Set flag for last thread-item
T successor_item = (linear_tid == BLOCK_THREADS - 1) ? tile_successor_item : // Last thread
temp_storage.first_items[linear_tid + 1];
tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp::FlagT(
flag_op, input[ITEMS_PER_THREAD - 1], successor_item, (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);
// Set head_flags for remaining items
Iterate::FlagHeads(linear_tid, head_flags, input, preds, flag_op);
// Set tail_flags for remaining items
Iterate::FlagTails(linear_tid, tail_flags, input, flag_op);
}
//! @rst
//! Sets both head and tail flags indicating discontinuities between items partitioned across the thread block.
//!
//! - The flag ``head_flags[i]`` is set for item ``input[i]`` when ``flag_op(previous-item, input[i])``
//! returns ``true`` (where ``previous-item`` is either the preceding item in the same thread or the last item
//! in the previous thread).
//! - For *thread*\ :sub:`0`, item ``input[0]`` is compared against ``tile_predecessor_item``.
//! - The flag ``tail_flags[i]`` is set for item ``input[i]`` when
//! ``flag_op(input[i], next-item)`` returns ``true`` (where ``next-item`` is either the next item
//! in the same thread or the first item in the next thread).
//! - For *thread*\ :sub:`BLOCK_THREADS - 1`, item
//! ``input[ITEMS_PER_THREAD - 1]`` is always flagged.
//! - @blocked
//! - @granularity
//! - @smemreuse
//!
//! Snippet
//! +++++++
//!
//! The code snippet below illustrates the head- and tail-flagging of 512 integer items that
//! are partitioned in a :ref:`blocked arrangement ` across 128 threads
//! where each thread owns 4 consecutive items.
//!
//! .. code-block:: c++
//!
//! #include // or equivalently
//!
//! __global__ void ExampleKernel(...)
//! {
//! // Specialize BlockDiscontinuity for a 1D block of 128 threads of type int
//! using BlockDiscontinuity = cub::BlockDiscontinuity;
//!
//! // Allocate shared memory for BlockDiscontinuity
//! __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
//!
//! // Obtain a segment of consecutive items that are blocked across threads
//! int thread_data[4];
//! ...
//!
//! // Have thread0 obtain the predecessor item for the entire tile
//! int tile_predecessor_item;
//! if (threadIdx.x == 0) tile_predecessor_item == ...
//!
//! // Have thread127 obtain the successor item for the entire tile
//! int tile_successor_item;
//! if (threadIdx.x == 127) tile_successor_item == ...
//!
//! // Collectively compute head and flags for discontinuities in the segment
//! int head_flags[4];
//! int tail_flags[4];
//! BlockDiscontinuity(temp_storage).FlagHeadsAndTails(
//! head_flags, tile_predecessor_item, tail_flags, tile_successor_item,
//! thread_data, cub::Inequality());
//!
//! Suppose the set of input ``thread_data`` across the block of threads is
//! ``{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }``,
//! that the ``tile_predecessor_item`` is ``0``, and that the ``tile_successor_item`` is ``125``.
//! The corresponding output ``head_flags`` in those threads will be
//! ``{ [0,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }``, and the corresponding output ``tail_flags``
//! in those threads will be ``{ [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,1] }``.
//! @endrst
//!
//! @tparam ITEMS_PER_THREAD
//! **[inferred]** The number of consecutive items partitioned onto each thread.
//!
//! @tparam FlagT
//! **[inferred]** The flag type (must be an integer type)
//!
//! @tparam FlagOp
//! **[inferred]** Binary predicate functor type having member
//! `T operator()(const T &a, const T &b)` or member
//! `T operator()(const T &a, const T &b, unsigned int b_index)`, and returning `true`
//! if a discontinuity exists between `a` and `b`, otherwise `false`. `b_index` is the rank
//! of b in the aggregate tile of data.
//!
//! @param[out] head_flags
//! Calling thread's discontinuity head_flags
//!
//! @param[in] tile_predecessor_item
//! @rst
//! *thread*\ :sub:`0` only item with which to compare the first tile item (``input[0]`` from *thread*\ :sub:`0`).
//! @endrst
//!
//! @param[out] tail_flags
//! Calling thread's discontinuity tail_flags
//!
//! @param[in] input
//! Calling thread's input items
//!
//! @param[in] flag_op
//! Binary boolean flag predicate
template
_CCCL_DEVICE _CCCL_FORCEINLINE void FlagHeadsAndTails(
FlagT (&head_flags)[ITEMS_PER_THREAD],
T tile_predecessor_item,
FlagT (&tail_flags)[ITEMS_PER_THREAD],
T (&input)[ITEMS_PER_THREAD],
FlagOp flag_op)
{
// Share first and last items
temp_storage.first_items[linear_tid] = input[0];
temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
CTA_SYNC();
T preds[ITEMS_PER_THREAD];
// Set flag for first thread-item
preds[0] = (linear_tid == 0) ? tile_predecessor_item : // First thread
temp_storage.last_items[linear_tid - 1];
head_flags[0] = ApplyOp::FlagT(flag_op, preds[0], input[0], linear_tid * ITEMS_PER_THREAD);
// Set flag for last thread-item
tail_flags[ITEMS_PER_THREAD - 1] =
(linear_tid == BLOCK_THREADS - 1) ? 1 : // Last thread
ApplyOp::FlagT(
flag_op,
input[ITEMS_PER_THREAD - 1],
temp_storage.first_items[linear_tid + 1],
(linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);
// Set head_flags for remaining items
Iterate::FlagHeads(linear_tid, head_flags, input, preds, flag_op);
// Set tail_flags for remaining items
Iterate::FlagTails(linear_tid, tail_flags, input, flag_op);
}
//! @rst
//! Sets both head and tail flags indicating discontinuities between items partitioned across the thread block.
//!
//! - The flag ``head_flags[i]`` is set for item ``input[i]`` when ``flag_op(previous-item, input[i])``
//! returns ``true`` (where ``previous-item`` is either the preceding item in the same thread or the last item in
//! the previous thread).
//! - For *thread*\ :sub:`0`, item ``input[0]`` is compared against ``tile_predecessor_item``.
//! - The flag ``tail_flags[i]`` is set for item ``input[i]`` when ``flag_op(input[i], next-item)``
//! returns ``true`` (where ``next-item`` is either the next item in the same thread or the first item in
//! the next thread).
//! - For *thread*\ :sub:`BLOCK_THREADS - 1`, item ``input[ITEMS_PER_THREAD - 1]`` is compared
//! against ``tile_successor_item``.
//! - @blocked
//! - @granularity
//! - @smemreuse
//!
//! Snippet
//! +++++++
//!
//! The code snippet below illustrates the head- and tail-flagging of 512 integer items that
//! are partitioned in a :ref:`blocked arrangement ` across 128 threads
//! where each thread owns 4 consecutive items.
//!
//! .. code-block:: c++
//!
//! #include // or equivalently
//!
//! __global__ void ExampleKernel(...)
//! {
//! // Specialize BlockDiscontinuity for a 1D block of 128 threads of type int
//! using BlockDiscontinuity = cub::BlockDiscontinuity;
//!
//! // Allocate shared memory for BlockDiscontinuity
//! __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
//!
//! // Obtain a segment of consecutive items that are blocked across threads
//! int thread_data[4];
//! ...
//!
//! // Have thread0 obtain the predecessor item for the entire tile
//! int tile_predecessor_item;
//! if (threadIdx.x == 0) tile_predecessor_item == ...
//!
//! // Have thread127 obtain the successor item for the entire tile
//! int tile_successor_item;
//! if (threadIdx.x == 127) tile_successor_item == ...
//!
//! // Collectively compute head and flags for discontinuities in the segment
//! int head_flags[4];
//! int tail_flags[4];
//! BlockDiscontinuity(temp_storage).FlagHeadsAndTails(
//! head_flags, tile_predecessor_item, tail_flags, tile_successor_item,
//! thread_data, cub::Inequality());
//!
//! Suppose the set of input ``thread_data`` across the block of threads is
//! ``{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }``,
//! that the ``tile_predecessor_item`` is ``0``, and that the
//! ``tile_successor_item`` is ``125``. The corresponding output ``head_flags``
//! in those threads will be ``{ [0,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }``.
//! and the corresponding output ``tail_flags`` in those threads will be
//! ``{ [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,0] }``.
//! @endrst
//!
//! @tparam ITEMS_PER_THREAD
//! **[inferred]** The number of consecutive items partitioned onto each thread.
//!
//! @tparam FlagT
//! **[inferred]** The flag type (must be an integer type)
//!
//! @tparam FlagOp
//! **[inferred]** Binary predicate functor type having member
//! `T operator()(const T &a, const T &b)` or member
//! `T operator()(const T &a, const T &b, unsigned int b_index)`, and returning `true`
//! if a discontinuity exists between `a` and `b`, otherwise `false`. `b_index` is the rank
//! of `b` in the aggregate tile of data.
//!
//! @param[out] head_flags
//! Calling thread's discontinuity head_flags
//!
//! @param[in] tile_predecessor_item
//! @rst
//! *thread*\ :sub:`0` only item with which to compare the first tile item (``input[0]`` from *thread*\ :sub:`0`).
//! @endrst
//!
//! @param[out] tail_flags
//! Calling thread's discontinuity tail_flags
//!
//! @param[in] tile_successor_item
//! @rst
//! *thread*\ :sub:`BLOCK_THREADS - 1` only item with which to compare the last tile item
//! (``input[ITEMS_PER_THREAD - 1]`` from *thread*\ :sub:`BLOCK_THREADS - 1`).
//! @endrst
//!
//! @param[in] input
//! Calling thread's input items
//!
//! @param[in] flag_op
//! Binary boolean flag predicate
template
_CCCL_DEVICE _CCCL_FORCEINLINE void FlagHeadsAndTails(
FlagT (&head_flags)[ITEMS_PER_THREAD],
T tile_predecessor_item,
FlagT (&tail_flags)[ITEMS_PER_THREAD],
T tile_successor_item,
T (&input)[ITEMS_PER_THREAD],
FlagOp flag_op)
{
// Share first and last items
temp_storage.first_items[linear_tid] = input[0];
temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
CTA_SYNC();
T preds[ITEMS_PER_THREAD];
// Set flag for first thread-item
preds[0] = (linear_tid == 0) ? tile_predecessor_item : // First thread
temp_storage.last_items[linear_tid - 1];
head_flags[0] = ApplyOp::FlagT(flag_op, preds[0], input[0], linear_tid * ITEMS_PER_THREAD);
// Set flag for last thread-item
T successor_item = (linear_tid == BLOCK_THREADS - 1) ? tile_successor_item : // Last thread
temp_storage.first_items[linear_tid + 1];
tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp::FlagT(
flag_op, input[ITEMS_PER_THREAD - 1], successor_item, (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);
// Set head_flags for remaining items
Iterate::FlagHeads(linear_tid, head_flags, input, preds, flag_op);
// Set tail_flags for remaining items
Iterate::FlagTails(linear_tid, tail_flags, input, flag_op);
}
//! @} end member group
};
CUB_NAMESPACE_END