From e696306bae683aebe418bf3de8fdeedf967ac4a7 Mon Sep 17 00:00:00 2001 From: "W. Trevor King" Date: Thu, 25 Nov 2010 11:01:10 -0500 Subject: [PATCH] Add CUDA by Example examples. --- content/GPUs/index.shtml | 37 +++-- src/cuda_by_example/.htaccess | 1 + src/cuda_by_example/.make_tar | 1 + src/cuda_by_example/cudaMallocAndMemcpy.cu | 114 ++++++++++++++ src/cuda_by_example/myFirstKernel.cu | 113 ++++++++++++++ .../reverseArray_multiblock.cu | 128 ++++++++++++++++ .../reverseArray_multiblock_fast.cu | 143 ++++++++++++++++++ .../reverseArray_singleblock.cu | 124 +++++++++++++++ 8 files changed, 649 insertions(+), 12 deletions(-) create mode 100644 src/cuda_by_example/.htaccess create mode 100644 src/cuda_by_example/.make_tar create mode 100644 src/cuda_by_example/cudaMallocAndMemcpy.cu create mode 100644 src/cuda_by_example/myFirstKernel.cu create mode 100644 src/cuda_by_example/reverseArray_multiblock.cu create mode 100644 src/cuda_by_example/reverseArray_multiblock_fast.cu create mode 100644 src/cuda_by_example/reverseArray_singleblock.cu diff --git a/content/GPUs/index.shtml b/content/GPUs/index.shtml index 4230520..6f82799 100644 --- a/content/GPUs/index.shtml +++ b/content/GPUs/index.shtml @@ -70,18 +70,18 @@ Fortran, Java and Matlab.

  • An interesting primer is that of Seland.
  • Also look at - the CUDA - C Programming Guide and other documentation distributed in - the CUDA - toolkit. Because the toolkit is a critical part of a working - CUDA installation, it may already be installed on your system. - On borg3, look in /usr/local/cuda/.
  • -
  • The CUDA SDK (download links on - the CUDA - toolkit page) contain lots of useful examples. Your sysadmin - may have installed it while setting up CUDA on your system. - On borg3 look - in /usr/local/cuda/doc/NVIDIA_GPU_Computing_SDK/.
  • + the CUDA + C Programming Guide and other documentation distributed in + the CUDA + toolkit. Because the toolkit is a critical part of a working + CUDA installation, it may already be installed on your system. + On borg3, look in /usr/local/cuda/. +
  • The CUDA SDK (download links on + the CUDA + toolkit page) contains lots of useful examples. Your sysadmin + may have installed it while setting up CUDA on your system. + On borg3, look + in /usr/local/cuda/doc/NVIDIA_GPU_Computing_SDK/.
  • David Kirk, NVDIA, and Wen-Mei Hwua, ECE, U of Illinos, have written an excellent textbook on CUDA: Programming @@ -93,6 +93,19 @@ Fortran, Java and Matlab.

    Gems 3
    contains great demonstration GPU codes.
  • +

    Learning CUDA by examples

    +

    Jason Sanders and Edward +Kandrot's CUDA +by Example contains a number of example problems. Here are +solutions to some of the problems:

    + + diff --git a/src/cuda_by_example/.htaccess b/src/cuda_by_example/.htaccess new file mode 100644 index 0000000..ca04e34 --- /dev/null +++ b/src/cuda_by_example/.htaccess @@ -0,0 +1 @@ +AddType text/plain .cu diff --git a/src/cuda_by_example/.make_tar b/src/cuda_by_example/.make_tar new file mode 100644 index 0000000..80e52ce --- /dev/null +++ b/src/cuda_by_example/.make_tar @@ -0,0 +1 @@ +./ diff --git a/src/cuda_by_example/cudaMallocAndMemcpy.cu b/src/cuda_by_example/cudaMallocAndMemcpy.cu new file mode 100644 index 0000000..50aac75 --- /dev/null +++ b/src/cuda_by_example/cudaMallocAndMemcpy.cu @@ -0,0 +1,114 @@ +/* + * Copyright 1993-2008 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO USER: + * + * This source code is subject to NVIDIA ownership rights under U.S. and + * international Copyright laws. Users and possessors of this source code + * are hereby granted a nonexclusive, royalty-free license to use this code + * in individual and commercial software. + * + * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE + * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR + * IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH + * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, + * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS + * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE + * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE + * OR PERFORMANCE OF THIS SOURCE CODE. + * + * U.S. Government End Users. This source code is a "commercial item" as + * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of + * "commercial computer software" and "commercial computer software + * documentation" as such terms are used in 48 C.F.R. 
12.212 (SEPT 1995) + * and is provided to the U.S. Government only as a commercial end item. + * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through + * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the + * source code with only those rights set forth herein. + * + * Any use of this source code in individual and commercial software must + * include, in the user documentation and internal comments to the code, + * the above Disclaimer and U.S. Government End Users Notice. + */ + +// includes, system +#include +#include + +// Simple utility function to check for CUDA runtime errors +void checkCUDAError(const char *msg); + +/////////////////////////////////////////////////////////////////////////////// +// Program main +/////////////////////////////////////////////////////////////////////////////// +int main(int argc, char **argv) +{ + + // pointer and dimension for host memory + int n, dimA; + float *h_a; + + // pointers for device memory + float *d_a, *d_b; + + // allocate and initialize host memory + // Bonus: try using cudaMallocHost in place of malloc + dimA = 8; + h_a = (float *)malloc(dimA * sizeof(float)); + for (n = 0; n < dimA; n++) { + h_a[n] = (float)n; + } + // Part 1 of 5: allocate device memory + size_t memSize = dimA * sizeof(float); + cudaMalloc((void **)&d_a, memSize); + cudaMalloc((void **)&d_b, memSize); + + // Part 2 of 5: host to device memory copy + cudaMemcpy(d_a, h_a, memSize, cudaMemcpyHostToDevice); + + // Part 3 of 5: device to device memory copy + cudaMemcpy(d_b, d_a, memSize, cudaMemcpyDeviceToDevice); + + // clear host memory + for (n = 0; n < dimA; n++) { + h_a[n] = 0.f; + } + + // Part 4 of 5: device to host copy + cudaMemcpy(h_a, d_b, memSize, cudaMemcpyDeviceToHost); + + // Check for any CUDA errors + checkCUDAError("cudaMemcpy calls"); + + // verify the data on the host is correct + for (n = 0; n < dimA; n++) { + assert(h_a[n] == (float)n); + } + // Part 5 of 5: free device memory pointers d_a and d_b + 
cudaFree(d_b); + cudaFree(d_a); + + // Check for any CUDA errors + checkCUDAError("cudaFree"); + + // free host memory pointer h_a + // Bonus: be sure to use cudaFreeHost for memory allocated with cudaMallocHost + free(h_a); + + // If the program makes it this far, then the results are correct and + // there are no run-time errors. Good work! + printf("Correct!\n"); + return 0; +} + +void checkCUDAError(const char *msg) +{ + cudaError_t err = cudaGetLastError(); + if (cudaSuccess != err) { + fprintf(stderr, "Cuda error: %s: %s.\n", msg, + cudaGetErrorString(err)); + exit(-1); + } +} diff --git a/src/cuda_by_example/myFirstKernel.cu b/src/cuda_by_example/myFirstKernel.cu new file mode 100644 index 0000000..8e23935 --- /dev/null +++ b/src/cuda_by_example/myFirstKernel.cu @@ -0,0 +1,113 @@ +/* + * Copyright 1993-2008 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO USER: + * + * This source code is subject to NVIDIA ownership rights under U.S. and + * international Copyright laws. Users and possessors of this source code + * are hereby granted a nonexclusive, royalty-free license to use this code + * in individual and commercial software. + * + * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE + * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR + * IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH + * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, + * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS + * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE + * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE + * OR PERFORMANCE OF THIS SOURCE CODE. + * + * U.S. Government End Users. This source code is a "commercial item" as + * that term is defined at 48 C.F.R. 
2.101 (OCT 1995), consisting of + * "commercial computer software" and "commercial computer software + * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) + * and is provided to the U.S. Government only as a commercial end item. + * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through + * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the + * source code with only those rights set forth herein. + * + * Any use of this source code in individual and commercial software must + * include, in the user documentation and internal comments to the code, + * the above Disclaimer and U.S. Government End Users Notice. + */ + +// includes, system +#include +#include + +// Simple utility function to check for CUDA runtime errors +void checkCUDAError(const char *msg); + +// Part 3 of 5: implement the kernel +__global__ void myFirstKernel(int *d_a) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + d_a[idx] = 1000 * blockIdx.x + threadIdx.x; +} + +//////////////////////////////////////////////////////////////////////////////// +// Program main +//////////////////////////////////////////////////////////////////////////////// +int main(int argc, char **argv) +{ + + // pointer for host memory + int *h_a; + + // pointer for device memory + int *d_a; + + // define grid and block size + int numBlocks = 8; + int numThreadsPerBlock = 8; + + // Part 1 of 5: allocate host and device memory + size_t memSize = numBlocks * numThreadsPerBlock * sizeof(int); + h_a = (int *)malloc(memSize); + cudaMalloc((void **)&d_a, memSize); + + // Part 2 of 5: launch kernel + dim3 dimGrid(numBlocks); + dim3 dimBlock(numThreadsPerBlock); + myFirstKernel <<< dimGrid, dimBlock >>> (d_a); + + // block until the device has completed + cudaThreadSynchronize(); + + // check if kernel execution generated an error + checkCUDAError("kernel execution"); + + // Part 4 of 5: device to host copy + cudaMemcpy(h_a, d_a, memSize, cudaMemcpyDeviceToHost); + + // Check for any 
CUDA errors + checkCUDAError("cudaMemcpy"); + + // Part 5 of 5: verify the data returned to the host is correct + for (int i = 0; i < numBlocks; i++) { + for (int j = 0; j < numThreadsPerBlock; j++) { + assert(h_a[i * numThreadsPerBlock + j] == 1000 * i + j); + }} + // free device memory + cudaFree(d_a); + + // free host memory + free(h_a); + + // If the program makes it this far, then the results are correct and + // there are no run-time errors. Good work! + printf("Correct!\n"); + return 0; +} + +void checkCUDAError(const char *msg) +{ + cudaError_t err = cudaGetLastError(); + if (cudaSuccess != err) { + fprintf(stderr, "Cuda error: %s: %s.\n", msg, + cudaGetErrorString(err)); + exit(-1); + } +} diff --git a/src/cuda_by_example/reverseArray_multiblock.cu b/src/cuda_by_example/reverseArray_multiblock.cu new file mode 100644 index 0000000..504e488 --- /dev/null +++ b/src/cuda_by_example/reverseArray_multiblock.cu @@ -0,0 +1,128 @@ +/* + * Copyright 1993-2008 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO USER: + * + * This source code is subject to NVIDIA ownership rights under U.S. and + * international Copyright laws. Users and possessors of this source code + * are hereby granted a nonexclusive, royalty-free license to use this code + * in individual and commercial software. + * + * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE + * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR + * IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH + * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. 
+ * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, + * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS + * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE + * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE + * OR PERFORMANCE OF THIS SOURCE CODE. + * + * U.S. Government End Users. This source code is a "commercial item" as + * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of + * "commercial computer software" and "commercial computer software + * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) + * and is provided to the U.S. Government only as a commercial end item. + * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through + * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the + * source code with only those rights set forth herein. + * + * Any use of this source code in individual and commercial software must + * include, in the user documentation and internal comments to the code, + * the above Disclaimer and U.S. Government End Users Notice. 
+ */ + +// includes, system +#include +#include + +// Simple utility function to check for CUDA runtime errors +void checkCUDAError(const char *msg); + +// Part3: implement the kernel +__global__ void reverseArrayBlock(int *d_out, int *d_in) +{ + int inOffset = blockDim.x * blockIdx.x; + int outOffset = blockDim.x * (gridDim.x - 1 - blockIdx.x); + int in = inOffset + threadIdx.x; + int out = outOffset + (blockDim.x - 1 - threadIdx.x); + d_out[out] = d_in[in]; +} + +//////////////////////////////////////////////////////////////////////////////// +// Program main +//////////////////////////////////////////////////////////////////////////////// +int main(int argc, char **argv) +{ + + // pointer for host memory and size + int *h_a; + int dimA = 256 * 1024; // 256K elements (1MB total) + + // pointer for device memory + int *d_b, *d_a; + + // define grid and block size + int numThreadsPerBlock = 256; + + // Part 1: compute number of blocks needed based on array size and desired block size + int numBlocks = dimA / numThreadsPerBlock; + + // allocate host and device memory + size_t memSize = numBlocks * numThreadsPerBlock * sizeof(int); + h_a = (int *)malloc(memSize); + cudaMalloc((void **)&d_a, memSize); + cudaMalloc((void **)&d_b, memSize); + + // Initialize input array on host + for (int i = 0; i < dimA; ++i) { + h_a[i] = i; + } + // Copy host array to device array + cudaMemcpy(d_a, h_a, memSize, cudaMemcpyHostToDevice); + + // launch kernel + dim3 dimGrid(numBlocks); + dim3 dimBlock(numThreadsPerBlock); + reverseArrayBlock <<< dimGrid, dimBlock >>> (d_b, d_a); + + // block until the device has completed + cudaThreadSynchronize(); + + // check if kernel execution generated an error + // Check for any CUDA errors + checkCUDAError("kernel invocation"); + + // device to host copy + cudaMemcpy(h_a, d_b, memSize, cudaMemcpyDeviceToHost); + + // Check for any CUDA errors + checkCUDAError("memcpy"); + + // verify the data returned to the host is correct + for (int i = 0; i < 
dimA; i++) { + assert(h_a[i] == dimA - 1 - i); + } + // free device memory + cudaFree(d_a); + cudaFree(d_b); + + // free host memory + free(h_a); + + // If the program makes it this far, then the results are correct and + // there are no run-time errors. Good work! + printf("Correct!\n"); + return 0; +} + +void checkCUDAError(const char *msg) +{ + cudaError_t err = cudaGetLastError(); + if (cudaSuccess != err) { + fprintf(stderr, "Cuda error: %s: %s.\n", msg, + cudaGetErrorString(err)); + exit(EXIT_FAILURE); + } +} diff --git a/src/cuda_by_example/reverseArray_multiblock_fast.cu b/src/cuda_by_example/reverseArray_multiblock_fast.cu new file mode 100644 index 0000000..0231006 --- /dev/null +++ b/src/cuda_by_example/reverseArray_multiblock_fast.cu @@ -0,0 +1,143 @@ +/* + * Copyright 1993-2008 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO USER: + * + * This source code is subject to NVIDIA ownership rights under U.S. and + * international Copyright laws. Users and possessors of this source code + * are hereby granted a nonexclusive, royalty-free license to use this code + * in individual and commercial software. + * + * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE + * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR + * IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH + * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, + * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS + * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE + * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE + * OR PERFORMANCE OF THIS SOURCE CODE. + * + * U.S. Government End Users. This source code is a "commercial item" as + * that term is defined at 48 C.F.R. 
2.101 (OCT 1995), consisting of + * "commercial computer software" and "commercial computer software + * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) + * and is provided to the U.S. Government only as a commercial end item. + * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through + * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the + * source code with only those rights set forth herein. + * + * Any use of this source code in individual and commercial software must + * include, in the user documentation and internal comments to the code, + * the above Disclaimer and U.S. Government End Users Notice. + */ + +// includes, system +#include +#include + +// Simple utility function to check for CUDA runtime errors +void checkCUDAError(const char *msg); + +// Part 2 of 2: implement the fast kernel using shared memory +__global__ void reverseArrayBlock(int *d_out, int *d_in) +{ + extern __shared__ int s_data[]; + int inOffset = blockDim.x * blockIdx.x; + int in = inOffset + threadIdx.x; + + // Load one element per thread from device memory and store it + // *in reversed order* into temporary shared memory + s_data[blockDim.x - 1 - threadIdx.x] = d_in[in]; + + // Block until all threads in the block have written their data to shared mem + __syncthreads(); + + // write the data from shared memory in forward order, + // but to the reversed block offset as before + int outOffset = blockDim.x * (gridDim.x - 1 - blockIdx.x); + int out = outOffset + threadIdx.x; + d_out[out] = s_data[threadIdx.x]; +} + +//////////////////////////////////////////////////////////////////////////////// +// Program main +//////////////////////////////////////////////////////////////////////////////// +int main(int argc, char **argv) +{ + + // pointer for host memory and size + int *h_a; + int dimA = 256 * 1024; // 256K elements (1MB total) + + // pointer for device memory + int *d_b, *d_a; + + // define grid and block size + int numThreadsPerBlock 
= 256; + + // Compute number of blocks needed based on array size and desired block size + int numBlocks = dimA / numThreadsPerBlock; + + // Part 1 of 2: Compute the number of bytes of shared memory needed + // This is used in the kernel invocation below + int sharedMemSize = numThreadsPerBlock * sizeof(int); + + // allocate host and device memory + size_t memSize = numBlocks * numThreadsPerBlock * sizeof(int); + h_a = (int *)malloc(memSize); + cudaMalloc((void **)&d_a, memSize); + cudaMalloc((void **)&d_b, memSize); + + // Initialize input array on host + for (int i = 0; i < dimA; ++i) { + h_a[i] = i; + } + // Copy host array to device array + cudaMemcpy(d_a, h_a, memSize, cudaMemcpyHostToDevice); + + // launch kernel + dim3 dimGrid(numBlocks); + dim3 dimBlock(numThreadsPerBlock); + reverseArrayBlock <<< dimGrid, dimBlock, sharedMemSize >>> (d_b, d_a); + + // block until the device has completed + cudaThreadSynchronize(); + + // check if kernel execution generated an error + // Check for any CUDA errors + checkCUDAError("kernel invocation"); + + // device to host copy + cudaMemcpy(h_a, d_b, memSize, cudaMemcpyDeviceToHost); + + // Check for any CUDA errors + checkCUDAError("memcpy"); + + // verify the data returned to the host is correct + for (int i = 0; i < dimA; i++) { + assert(h_a[i] == dimA - 1 - i); + } + // free device memory + cudaFree(d_a); + cudaFree(d_b); + + // free host memory + free(h_a); + + // If the program makes it this far, then the results are correct and + // there are no run-time errors. Good work! 
+ printf("Correct!\n"); + return 0; +} + +void checkCUDAError(const char *msg) +{ + cudaError_t err = cudaGetLastError(); + if (cudaSuccess != err) { + fprintf(stderr, "Cuda error: %s: %s.\n", msg, + cudaGetErrorString(err)); + exit(EXIT_FAILURE); + } +} diff --git a/src/cuda_by_example/reverseArray_singleblock.cu b/src/cuda_by_example/reverseArray_singleblock.cu new file mode 100644 index 0000000..3161f4a --- /dev/null +++ b/src/cuda_by_example/reverseArray_singleblock.cu @@ -0,0 +1,124 @@ +/* + * Copyright 1993-2008 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO USER: + * + * This source code is subject to NVIDIA ownership rights under U.S. and + * international Copyright laws. Users and possessors of this source code + * are hereby granted a nonexclusive, royalty-free license to use this code + * in individual and commercial software. + * + * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE + * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR + * IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH + * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, + * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS + * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE + * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE + * OR PERFORMANCE OF THIS SOURCE CODE. + * + * U.S. Government End Users. This source code is a "commercial item" as + * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of + * "commercial computer software" and "commercial computer software + * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) + * and is provided to the U.S. Government only as a commercial end item. + * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 
227.7202-1 through + * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the + * source code with only those rights set forth herein. + * + * Any use of this source code in individual and commercial software must + * include, in the user documentation and internal comments to the code, + * the above Disclaimer and U.S. Government End Users Notice. + */ + +// includes, system +#include +#include + +// Simple utility function to check for CUDA runtime errors +void checkCUDAError(const char *msg); + +// Part 1 of 1: implement the kernel +__global__ void reverseArrayBlock(int *d_out, int *d_in) +{ + int in = threadIdx.x; + int out = blockDim.x - 1 - threadIdx.x; + d_out[out] = d_in[in]; +} + +//////////////////////////////////////////////////////////////////////////////// +// Program main +//////////////////////////////////////////////////////////////////////////////// +int main(int argc, char **argv) +{ + + // pointer for host memory and size + int *h_a; + int dimA = 256; + + // pointer for device memory + int *d_b, *d_a; + + // define grid and block size + int numBlocks = 1; + int numThreadsPerBlock = dimA; + + // allocate host and device memory + size_t memSize = numBlocks * numThreadsPerBlock * sizeof(int); + h_a = (int *)malloc(memSize); + cudaMalloc((void **)&d_a, memSize); + cudaMalloc((void **)&d_b, memSize); + + // Initialize input array on host + for (int i = 0; i < dimA; ++i) { + h_a[i] = i; + } + // Copy host array to device array + cudaMemcpy(d_a, h_a, memSize, cudaMemcpyHostToDevice); + + // launch kernel + dim3 dimGrid(numBlocks); + dim3 dimBlock(numThreadsPerBlock); + reverseArrayBlock <<< dimGrid, dimBlock >>> (d_b, d_a); + + // block until the device has completed + cudaThreadSynchronize(); + + // check if kernel execution generated an error + // Check for any CUDA errors + checkCUDAError("kernel invocation"); + + // device to host copy + cudaMemcpy(h_a, d_b, memSize, cudaMemcpyDeviceToHost); + + // Check for any CUDA errors + 
checkCUDAError("memcpy"); + + // verify the data returned to the host is correct + for (int i = 0; i < dimA; i++) { + assert(h_a[i] == dimA - 1 - i); + } + // free device memory + cudaFree(d_a); + cudaFree(d_b); + + // free host memory + free(h_a); + + // If the program makes it this far, then the results are correct and + // there are no run-time errors. Good work! + printf("Correct!\n"); + return 0; +} + +void checkCUDAError(const char *msg) +{ + cudaError_t err = cudaGetLastError(); + if (cudaSuccess != err) { + fprintf(stderr, "Cuda error: %s: %s.\n", msg, + cudaGetErrorString(err)); + exit(EXIT_FAILURE); + } +} -- 2.26.2