--- /dev/null
+/*
+ * Copyright 1993-2008 NVIDIA Corporation. All rights reserved.
+ *
+ * NOTICE TO USER:
+ *
+ * This source code is subject to NVIDIA ownership rights under U.S. and
+ * international Copyright laws. Users and possessors of this source code
+ * are hereby granted a nonexclusive, royalty-free license to use this code
+ * in individual and commercial software.
+ *
+ * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
+ * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
+ * IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
+ * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
+ * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
+ * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
+ * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
+ * OR PERFORMANCE OF THIS SOURCE CODE.
+ *
+ * U.S. Government End Users. This source code is a "commercial item" as
+ * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
+ * "commercial computer software" and "commercial computer software
+ * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
+ * and is provided to the U.S. Government only as a commercial end item.
+ * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
+ * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
+ * source code with only those rights set forth herein.
+ *
+ * Any use of this source code in individual and commercial software must
+ * include, in the user documentation and internal comments to the code,
+ * the above Disclaimer and U.S. Government End Users Notice.
+ */
+
+// includes, system
+#include <stdio.h>
+#include <assert.h>
+
+// Simple utility function to check for CUDA runtime errors
+void checkCUDAError(const char *msg);
+
+// Part3: implement the kernel
+__global__ void reverseArrayBlock(int *d_out, int *d_in)
+{
+ int inOffset = blockDim.x * blockIdx.x;
+ int outOffset = blockDim.x * (gridDim.x - 1 - blockIdx.x);
+ int in = inOffset + threadIdx.x;
+ int out = outOffset + (blockDim.x - 1 - threadIdx.x);
+ d_out[out] = d_in[in];
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Program main
+////////////////////////////////////////////////////////////////////////////////
+int main(int argc, char **argv)
+{
+
+ // pointer for host memory and size
+ int *h_a;
+ int dimA = 256 * 1024; // 256K elements (1MB total)
+
+ // pointer for device memory
+ int *d_b, *d_a;
+
+ // define grid and block size
+ int numThreadsPerBlock = 256;
+
+ // Part 1: compute number of blocks needed based on array size and desired block size
+ int numBlocks = dimA / numThreadsPerBlock;
+
+ // allocate host and device memory
+ size_t memSize = numBlocks * numThreadsPerBlock * sizeof(int);
+ h_a = (int *)malloc(memSize);
+ cudaMalloc((void **)&d_a, memSize);
+ cudaMalloc((void **)&d_b, memSize);
+
+ // Initialize input array on host
+ for (int i = 0; i < dimA; ++i) {
+ h_a[i] = i;
+ }
+ // Copy host array to device array
+ cudaMemcpy(d_a, h_a, memSize, cudaMemcpyHostToDevice);
+
+ // launch kernel
+ dim3 dimGrid(numBlocks);
+ dim3 dimBlock(numThreadsPerBlock);
+ reverseArrayBlock <<< dimGrid, dimBlock >>> (d_b, d_a);
+
+ // block until the device has completed
+ cudaThreadSynchronize();
+
+ // check if kernel execution generated an error
+ // Check for any CUDA errors
+ checkCUDAError("kernel invocation");
+
+ // device to host copy
+ cudaMemcpy(h_a, d_b, memSize, cudaMemcpyDeviceToHost);
+
+ // Check for any CUDA errors
+ checkCUDAError("memcpy");
+
+ // verify the data returned to the host is correct
+ for (int i = 0; i < dimA; i++) {
+ assert(h_a[i] == dimA - 1 - i);
+ }
+ // free device memory
+ cudaFree(d_a);
+ cudaFree(d_b);
+
+ // free host memory
+ free(h_a);
+
+ // If the program makes it this far, then the results are correct and
+ // there are no run-time errors. Good work!
+ printf("Correct!\n");
+ return 0;
+}
+
+void checkCUDAError(const char *msg)
+{
+ cudaError_t err = cudaGetLastError();
+ if (cudaSuccess != err) {
+ fprintf(stderr, "Cuda error: %s: %s.\n", msg,
+ cudaGetErrorString(err));
+ exit(EXIT_FAILURE);
+ }
+}