//===----------------------------------------------------------------------===// // // Part of libcu++, the C++ Standard Library for your entire system, // under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. // //===----------------------------------------------------------------------===// #ifndef _CUDA_BARRIER #define _CUDA_BARRIER #include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) # pragma clang system_header #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) # pragma system_header #endif // no system header #include #include // Forward-declare CUtensorMap for use in cp_async_bulk_tensor_* PTX wrapping // functions. These functions take a pointer to CUtensorMap, so do not need to // know its size. This type is defined in cuda.h (driver API) as: // // typedef struct CUtensorMap_st { [ .. snip .. ] } CUtensorMap; // // We need to forward-declare both CUtensorMap_st (the struct) and CUtensorMap // (the typedef): struct CUtensorMap_st; typedef struct CUtensorMap_st CUtensorMap; _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_DEVICE_EXPERIMENTAL // Experimental exposure of TMA PTX: // // - cp_async_bulk_global_to_shared // - cp_async_bulk_shared_to_global // - cp_async_bulk_tensor_{1,2,3,4,5}d_global_to_shared // - cp_async_bulk_tensor_{1,2,3,4,5}d_shared_to_global // - fence_proxy_async_shared_cta // - cp_async_bulk_commit_group // - cp_async_bulk_wait_group_read<0, …, 7> // These PTX wrappers are only available when the code is compiled compute // capability 9.0 and above. The check for (!defined(__CUDA_MINIMUM_ARCH__)) is // necessary to prevent cudafe from ripping out the device functions before // device compilation begins. #ifdef __cccl_lib_experimental_ctk12_cp_async_exposure // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk inline _CCCL_DEVICE void cp_async_bulk_global_to_shared( void* __dest, const void* __src, _CUDA_VSTD::uint32_t __size, ::cuda::barrier<::cuda::thread_scope_block>& __bar) { _LIBCUDACXX_DEBUG_ASSERT(__size % 16 == 0, "Size must be multiple of 16."); _LIBCUDACXX_DEBUG_ASSERT(__isShared(__dest), "Destination must be shared memory address."); _LIBCUDACXX_DEBUG_ASSERT(__isGlobal(__src), "Source must be global memory address."); _CUDA_VPTX::cp_async_bulk( _CUDA_VPTX::space_cluster, _CUDA_VPTX::space_global, __dest, __src, __size, ::cuda::device::barrier_native_handle(__bar)); } // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk inline _CCCL_DEVICE void cp_async_bulk_shared_to_global(void* __dest, const void* __src, _CUDA_VSTD::uint32_t __size) { _LIBCUDACXX_DEBUG_ASSERT(__size % 16 == 0, "Size must be multiple of 16."); _LIBCUDACXX_DEBUG_ASSERT(__isGlobal(__dest), "Destination must be global memory address."); _LIBCUDACXX_DEBUG_ASSERT(__isShared(__src), "Source must be shared memory address."); _CUDA_VPTX::cp_async_bulk(_CUDA_VPTX::space_global, _CUDA_VPTX::space_shared, __dest, __src, __size); } // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor inline _CCCL_DEVICE void cp_async_bulk_tensor_1d_global_to_shared( void* __dest, const CUtensorMap* __tensor_map, int __c0, ::cuda::barrier<::cuda::thread_scope_block>& __bar) { const _CUDA_VSTD::int32_t __coords[]{__c0}; _CUDA_VPTX::cp_async_bulk_tensor( _CUDA_VPTX::space_cluster, _CUDA_VPTX::space_global, __dest, __tensor_map, __coords, ::cuda::device::barrier_native_handle(__bar)); } // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor inline _CCCL_DEVICE void cp_async_bulk_tensor_2d_global_to_shared( void* __dest, const CUtensorMap* __tensor_map, int __c0, int __c1, ::cuda::barrier<::cuda::thread_scope_block>& __bar) { const _CUDA_VSTD::int32_t __coords[]{__c0, __c1}; _CUDA_VPTX::cp_async_bulk_tensor( _CUDA_VPTX::space_cluster, _CUDA_VPTX::space_global, __dest, __tensor_map, __coords, ::cuda::device::barrier_native_handle(__bar)); } // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor inline _CCCL_DEVICE void cp_async_bulk_tensor_3d_global_to_shared( void* __dest, const CUtensorMap* __tensor_map, int __c0, int __c1, int __c2, ::cuda::barrier<::cuda::thread_scope_block>& __bar) { const _CUDA_VSTD::int32_t __coords[]{__c0, __c1, __c2}; _CUDA_VPTX::cp_async_bulk_tensor( _CUDA_VPTX::space_cluster, _CUDA_VPTX::space_global, __dest, __tensor_map, __coords, ::cuda::device::barrier_native_handle(__bar)); } // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor inline _CCCL_DEVICE void cp_async_bulk_tensor_4d_global_to_shared( void* __dest, const CUtensorMap* __tensor_map, int __c0, int __c1, int __c2, int __c3, ::cuda::barrier<::cuda::thread_scope_block>& __bar) { const _CUDA_VSTD::int32_t __coords[]{__c0, __c1, __c2, __c3}; _CUDA_VPTX::cp_async_bulk_tensor( _CUDA_VPTX::space_cluster, _CUDA_VPTX::space_global, __dest, __tensor_map, __coords, ::cuda::device::barrier_native_handle(__bar)); } // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor inline _CCCL_DEVICE void cp_async_bulk_tensor_5d_global_to_shared( void* __dest, const CUtensorMap* __tensor_map, int __c0, int __c1, int __c2, int __c3, int __c4, ::cuda::barrier<::cuda::thread_scope_block>& __bar) { const _CUDA_VSTD::int32_t __coords[]{__c0, __c1, __c2, __c3, __c4}; _CUDA_VPTX::cp_async_bulk_tensor( _CUDA_VPTX::space_cluster, _CUDA_VPTX::space_global, __dest, __tensor_map, __coords, ::cuda::device::barrier_native_handle(__bar)); } // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor inline _CCCL_DEVICE void cp_async_bulk_tensor_1d_shared_to_global(const CUtensorMap* __tensor_map, int __c0, const void* __src) { const _CUDA_VSTD::int32_t __coords[]{__c0}; _CUDA_VPTX::cp_async_bulk_tensor(_CUDA_VPTX::space_global, _CUDA_VPTX::space_shared, __tensor_map, __coords, __src); } // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor inline _CCCL_DEVICE void cp_async_bulk_tensor_2d_shared_to_global(const CUtensorMap* __tensor_map, int __c0, int __c1, const void* __src) { const _CUDA_VSTD::int32_t __coords[]{__c0, __c1}; _CUDA_VPTX::cp_async_bulk_tensor(_CUDA_VPTX::space_global, _CUDA_VPTX::space_shared, __tensor_map, __coords, __src); } // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor inline _CCCL_DEVICE void cp_async_bulk_tensor_3d_shared_to_global( const CUtensorMap* __tensor_map, int __c0, int __c1, int __c2, const void* __src) { const _CUDA_VSTD::int32_t __coords[]{__c0, __c1, __c2}; _CUDA_VPTX::cp_async_bulk_tensor(_CUDA_VPTX::space_global, _CUDA_VPTX::space_shared, __tensor_map, __coords, __src); } // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor inline _CCCL_DEVICE void cp_async_bulk_tensor_4d_shared_to_global( const CUtensorMap* __tensor_map, int __c0, int __c1, int __c2, int __c3, const void* __src) { const _CUDA_VSTD::int32_t __coords[]{__c0, __c1, __c2, __c3}; _CUDA_VPTX::cp_async_bulk_tensor(_CUDA_VPTX::space_global, _CUDA_VPTX::space_shared, __tensor_map, __coords, __src); } // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor inline _CCCL_DEVICE void cp_async_bulk_tensor_5d_shared_to_global( const CUtensorMap* __tensor_map, int __c0, int __c1, int __c2, int __c3, int __c4, const void* __src) { const _CUDA_VSTD::int32_t __coords[]{__c0, __c1, __c2, __c3, __c4}; _CUDA_VPTX::cp_async_bulk_tensor(_CUDA_VPTX::space_global, _CUDA_VPTX::space_shared, __tensor_map, __coords, __src); } // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar inline _CCCL_DEVICE void fence_proxy_async_shared_cta() { _CUDA_VPTX::fence_proxy_async(_CUDA_VPTX::space_shared); } // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-commit-group inline _CCCL_DEVICE void cp_async_bulk_commit_group() { _CUDA_VPTX::cp_async_bulk_commit_group(); } // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-wait-group template inline _CCCL_DEVICE void cp_async_bulk_wait_group_read() { static_assert(__n_prior <= 63, "cp_async_bulk_wait_group_read: waiting for more than 63 groups is not supported."); _CUDA_VPTX::cp_async_bulk_wait_group_read(_CUDA_VPTX::n32_t<__n_prior>{}); } #endif // __cccl_lib_experimental_ctk12_cp_async_exposure _LIBCUDACXX_END_NAMESPACE_CUDA_DEVICE_EXPERIMENTAL #endif // _CUDA_BARRIER