/* * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of NVIDIA CORPORATION nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * Sample CUDA application for uncoalesced global memory accesses. * Adds a floating point constant to an input array of double3 elements in * global memory and generates an output array of double3 in global memory. */ #include #include #define BLOCK_SIZE 256 #define RUNTIME_API_CALL(apiFuncCall) \ do { \ cudaError_t _status = apiFuncCall; \ if (_status != cudaSuccess) { \ fprintf(stderr, "%s:%d: error: function %s failed with error %s.\n", \ __FILE__, __LINE__, #apiFuncCall, cudaGetErrorString(_status));\ exit(EXIT_FAILURE); \ } \ } while (0) __global__ void addConstDouble3(int numElements, double3 *d_in, double k, double3 *d_out) { int index = blockIdx.x * blockDim.x + threadIdx.x; if (index < numElements) { double3 a = d_in[index]; a.x += k; a.y += k; a.z += k; d_out[index] = a; } } __global__ void addConstDouble(int numElements, double *d_in, double k, double *d_out) { int index = blockIdx.x * blockDim.x + threadIdx.x; if (index < numElements) { d_out[index] = d_in[index] + k; } } int main (int argc, char *argv[]) { // Error code to check return values for CUDA calls cudaError_t err = cudaSuccess; double constK = 10.0; int kernelOption = 0; if (argc > 1) { kernelOption = atoi(argv[1]); } int numElements = 1024*1024; if (argc > 2) { numElements = atoi(argv[2]); if (numElements <= 0) { fprintf(stderr, "Invalid number of elements(%s), should be a positive number\n", argv[2]); exit(EXIT_FAILURE); } } printf("double3 constant addition of %d elements\n", numElements); printf("kernelOption=%d\n", kernelOption); size_t size = numElements * sizeof(double3); // Allocate the host input array double3 *h_A = (double3 *)malloc(size); // Allocate the host output array double3 *h_B = (double3 *)malloc(size); // Verify that allocations succeeded if (h_A == NULL || h_B == NULL) { fprintf(stderr, "Failed to allocate host arrays!\n"); exit(EXIT_FAILURE); } // Initialize the host input vectors for (int i = 0; i < numElements; ++i) { h_A[i].x = rand()/(double)RAND_MAX; h_A[i].y = rand()/(double)RAND_MAX; h_A[i].z = rand()/(double)RAND_MAX; } // Allocate the device input array A double3 *d_A = NULL; RUNTIME_API_CALL(cudaMalloc((void **)&d_A, size)); // Allocate the device output array B double3 *d_B = NULL; RUNTIME_API_CALL(cudaMalloc((void **)&d_B, size)); // Copy the host input array A in host memory to the device input array in device memory RUNTIME_API_CALL(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice)); // Launch the CUDA Kernel int threadsPerBlock = BLOCK_SIZE; if (kernelOption == 0) { int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; printf("CUDA kernel addConstDouble3 launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock); addConstDouble3<<>>(numElements, d_A, constK, d_B); } else if (kernelOption == 1) { int blocksPerGrid =(numElements*3 + threadsPerBlock - 1) / threadsPerBlock; printf("CUDA kernel addConstDouble launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock); addConstDouble<<>>(numElements*3, (double *)d_A, constK, (double *)d_B); } else { fprintf(stderr, "** Invalid kernel option %d\n", kernelOption); exit(EXIT_FAILURE); } err = cudaGetLastError(); if (err != cudaSuccess) { fprintf(stderr, "Failed to launch kernel (error code %s)!\n", cudaGetErrorString(err)); exit(EXIT_FAILURE); } // Copy the device result array in device memory to the host result vector // in host memory. RUNTIME_API_CALL(cudaMemcpy(h_B, d_B, size, cudaMemcpyDeviceToHost)); // Verify that the result vector is correct for (int i = 0; i < numElements; ++i) { if ((fabs(h_A[i].x + constK - h_B[i].x) > 1e-5) || (fabs(h_A[i].y + constK - h_B[i].y) > 1e-5) || (fabs(h_A[i].z + constK - h_B[i].z) > 1e-5)) { fprintf(stderr, "Result verification failed at element %d!\n", i); exit(EXIT_FAILURE); } } // Free device global memory RUNTIME_API_CALL(cudaFree(d_A)); RUNTIME_API_CALL(cudaFree(d_B)); // Free host memory free(h_A); free(h_B); printf("Done\n"); return 0; }