/* * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of NVIDIA CORPORATION nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Sample CUDA application for shared memory bank conflicts. * Transposes a N x N square matrix of float elements in * global memory and generates an output matrix in global memory. * */ #include #include #define DEFAULT_KERNEL_OPTION 1 #define DEFAULT_MATRIX_SIZE 512 #define RUNTIME_API_CALL(apiFuncCall) \ do \ { \ cudaError_t _status = apiFuncCall; \ if (_status != cudaSuccess) \ { \ fprintf(stderr, "%s:%d: error: function %s failed with error %s.\n", __FILE__, \ __LINE__, #apiFuncCall, cudaGetErrorString(_status)); \ exit(EXIT_FAILURE); \ } \ } while (0) #define PRINT_PROGRAM_USAGE() \ fprintf(stderr, "Usage: %s [] [] []\n" \ " Default kernel option: %d\n" \ " Use 1 for '%s' and 2 for '%s'\n" \ " Default matrix size: %d\n" \ " Matrix size should be greater than or equal to tile size: %d and" \ " must be an integral multiple of tile size.\n" \ " Default cache config option: none\n" \ " Options: none|shared|l1|equal\n", \ argv[0], DEFAULT_KERNEL_OPTION, \ "transposeCoalesced", "transposeNoBankConflicts", \ DEFAULT_MATRIX_SIZE, TILE_DIM) // Each block transposes a tile of (TILE_DIM x TILE_DIM) elements // using TILE_DIM x BLOCK_ROWS threads, // so that each thread transposes (TILE_DIM / BLOCK_ROWS) elements. // TILE_DIM must be an integral multiple of BLOCK_ROWS #define TILE_DIM 32 #define BLOCK_ROWS 8 // Coalesced global memory transpose with shared memory bank conflicts __global__ void transposeCoalesced(float* odata, float* idata, int width, int height) { __shared__ float tile[TILE_DIM][TILE_DIM]; int xIndex = blockIdx.x * TILE_DIM + threadIdx.x; int yIndex = blockIdx.y * TILE_DIM + threadIdx.y; int indexIn = xIndex + yIndex*width; xIndex = blockIdx.y * TILE_DIM + threadIdx.x; yIndex = blockIdx.x * TILE_DIM + threadIdx.y; int indexOut = xIndex + yIndex*height; for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS) { tile[threadIdx.y + i][threadIdx.x] = idata[indexIn + i * width]; } __syncthreads(); for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS) { odata[indexOut + i * height] = tile[threadIdx.x][threadIdx.y + i]; } } // Coalesced global memory transpose with no shared memory bank conflicts __global__ void transposeNoBankConflicts(float* odata, float* idata, int width, int height) { __shared__ float tile[TILE_DIM][TILE_DIM + 1]; int xIndex = blockIdx.x * TILE_DIM + threadIdx.x; int yIndex = blockIdx.y * TILE_DIM + threadIdx.y; int indexIn = xIndex + yIndex*width; xIndex = blockIdx.y * TILE_DIM + threadIdx.x; yIndex = blockIdx.x * TILE_DIM + threadIdx.y; int indexOut = xIndex + yIndex*height; for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS) { tile[threadIdx.y + i][threadIdx.x] = idata[indexIn + i * width]; } __syncthreads(); for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS) { odata[indexOut + i * height] = tile[threadIdx.x][threadIdx.y + i]; } } void computeTransposeGold(float* gold, float* idata, const int size_x, const int size_y) { for (int y = 0; y < size_y; ++y) { for (int x = 0; x < size_x; ++x) { gold[(x * size_y) + y] = idata[(y * size_x) + x]; } } } bool compareData(const float* reference, const float* data, const unsigned int len) { const float epsilon = 0.01f; for (unsigned int i = 0; i < len; ++i) { float diff = reference[i] - data[i]; if ((diff > epsilon) || (diff < -epsilon)) return false; } return true; } int setCacheConfig(const char *cacheConfigStr, const void *kernel) { const int NumCacheConfigs = 4; const char *cacheConfigOptionsStr[NumCacheConfigs] = { "none", "shared", "l1", "equal"}; cudaFuncCache cacheConfigOptions[NumCacheConfigs] = {cudaFuncCachePreferNone, cudaFuncCachePreferShared, cudaFuncCachePreferL1, cudaFuncCachePreferEqual}; cudaFuncCache cacheConfigOption; int i; for(i = 0; i < NumCacheConfigs; i++) { if (strcmp(cacheConfigStr, cacheConfigOptionsStr[i]) == 0) { cacheConfigOption = cacheConfigOptions[i]; break; } } if (i >= NumCacheConfigs) { fprintf(stderr, "Invalid cache config option : '%s'\n", cacheConfigStr); return -1; } fprintf(stderr, "Set cache config option : '%s'\n", cacheConfigStr); RUNTIME_API_CALL(cudaFuncSetCacheConfig(kernel, cacheConfigOption)); return 0; } int main(int argc, char* argv[]) { int kernelOption = DEFAULT_KERNEL_OPTION; if (argc > 1) { kernelOption = atoi(argv[1]); } void (*kernel)(float*, float*, int, int); const char* kernelName; if (kernelOption == 1) { kernel = &transposeCoalesced; kernelName = "transposeCoalesced"; } else if (kernelOption == 2) { kernel = &transposeNoBankConflicts; kernelName = "transposeNoBankConflicts"; } else { fprintf(stderr, "** Invalid kernel option: %s\n", argv[1]); PRINT_PROGRAM_USAGE(); exit(EXIT_FAILURE); } int matrixSize = DEFAULT_MATRIX_SIZE; if (argc > 2) { matrixSize = atoi(argv[2]); } if ((matrixSize < TILE_DIM) || (matrixSize % TILE_DIM != 0)) { fprintf(stderr, "** Invalid matrix size: %s\n", argv[2]); PRINT_PROGRAM_USAGE(); exit(EXIT_FAILURE); } // size of memory required to store the matrix size_t memSize = sizeof(float) * matrixSize * matrixSize; // allocate host memory float* h_idata = (float*)malloc(memSize); float* h_odata = (float*)malloc(memSize); float* transposeGold = (float*)malloc(memSize); // allocate device memory float *d_idata, *d_odata; RUNTIME_API_CALL(cudaMalloc((void**)&d_idata, memSize)); RUNTIME_API_CALL(cudaMalloc((void**)&d_odata, memSize)); // initialize host data for (int i = 0; i < (matrixSize * matrixSize); ++i) { h_idata[i] = (float)i; } // copy host data to device RUNTIME_API_CALL(cudaMemcpy(d_idata, h_idata, memSize, cudaMemcpyHostToDevice)); printf("\nmatrix size: %dx%d (%dx%d tiles), kernel name: '%s', " "tile size: %dx%d, block size: %dx%d\n", matrixSize, matrixSize, matrixSize/TILE_DIM, matrixSize/TILE_DIM, kernelName, TILE_DIM, TILE_DIM, TILE_DIM, BLOCK_ROWS); if ((argc > 3) && setCacheConfig(argv[3], (const void *)kernel)) { PRINT_PROGRAM_USAGE(); exit(EXIT_FAILURE); } // execution configuration parameters dim3 grid(matrixSize / TILE_DIM, matrixSize / TILE_DIM); dim3 threads(TILE_DIM, BLOCK_ROWS); kernel<<>>(d_odata, d_idata, matrixSize, matrixSize); cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { fprintf(stderr, "Failed to launch '%s' kernel (error code %s)!\n", kernelName, cudaGetErrorString(err)); exit(EXIT_FAILURE); } RUNTIME_API_CALL(cudaMemcpy(h_odata, d_odata, memSize, cudaMemcpyDeviceToHost)); // Compute reference transpose solution computeTransposeGold(transposeGold, h_idata, matrixSize, matrixSize); bool res = compareData(transposeGold, h_odata, matrixSize * matrixSize); if (res == false) { fprintf(stderr, "** '%s' kernel FAILED\n", kernelName); exit(EXIT_FAILURE); } printf("Done\n"); return 0; }