From e696306bae683aebe418bf3de8fdeedf967ac4a7 Mon Sep 17 00:00:00 2001 From: "W. Trevor King" Date: Thu, 25 Nov 2010 11:01:10 -0500 Subject: [PATCH] Add CUDA by Example examples. --- content/GPUs/index.shtml | 37 +++-- src/cuda_by_example/.htaccess | 1 + src/cuda_by_example/.make_tar | 1 + src/cuda_by_example/cudaMallocAndMemcpy.cu | 114 ++++++++++++++ src/cuda_by_example/myFirstKernel.cu | 113 ++++++++++++++ .../reverseArray_multiblock.cu | 128 ++++++++++++++++ .../reverseArray_multiblock_fast.cu | 143 ++++++++++++++++++ .../reverseArray_singleblock.cu | 124 +++++++++++++++ 8 files changed, 649 insertions(+), 12 deletions(-) create mode 100644 src/cuda_by_example/.htaccess create mode 100644 src/cuda_by_example/.make_tar create mode 100644 src/cuda_by_example/cudaMallocAndMemcpy.cu create mode 100644 src/cuda_by_example/myFirstKernel.cu create mode 100644 src/cuda_by_example/reverseArray_multiblock.cu create mode 100644 src/cuda_by_example/reverseArray_multiblock_fast.cu create mode 100644 src/cuda_by_example/reverseArray_singleblock.cu diff --git a/content/GPUs/index.shtml b/content/GPUs/index.shtml index 4230520..6f82799 100644 --- a/content/GPUs/index.shtml +++ b/content/GPUs/index.shtml @@ -70,18 +70,18 @@ Fortran, Java and Matlab.

  • An interesting primer is that of Seland.
  • Also look at - the CUDA - C Programming Guide and other documentation distributed in - the CUDA - toolkit. Because the toolkit is a critical part of a working - CUDA installation, it may already be installed on your system. - On borg3, look in /usr/local/cuda/.
  • -
  • The CUDA SDK (download links on - the CUDA - toolkit page) contain lots of useful examples. Your sysadmin - may have installed it while setting up CUDA on your system. - On borg3 look - in /usr/local/cuda/doc/NVIDIA_GPU_Computing_SDK/.
  • + the CUDA + C Programming Guide and other documentation distributed in + the CUDA + toolkit. Because the toolkit is a critical part of a working + CUDA installation, it may already be installed on your system. + On borg3, look in /usr/local/cuda/. +
  • The CUDA SDK (download links on + the CUDA + toolkit page) contains lots of useful examples. Your sysadmin + may have installed it while setting up CUDA on your system. + On borg3, look + in /usr/local/cuda/doc/NVIDIA_GPU_Computing_SDK/.
  • David Kirk, NVDIA, and Wen-Mei Hwua, ECE, U of Illinos, have written an excellent textbook on CUDA: Programming @@ -93,6 +93,19 @@ Fortran, Java and Matlab.

    Gems 3
    contains great demonstration GPU codes.
  • +

    Learning CUDA by examples

    +

    Jason Sanders and Edward +Kandrot's CUDA +by Example contains a number of example problems. Here are +solutions to some of the problems:

    + + diff --git a/src/cuda_by_example/.htaccess b/src/cuda_by_example/.htaccess new file mode 100644 index 0000000..ca04e34 --- /dev/null +++ b/src/cuda_by_example/.htaccess @@ -0,0 +1 @@ +AddType text/plain .cu diff --git a/src/cuda_by_example/.make_tar b/src/cuda_by_example/.make_tar new file mode 100644 index 0000000..80e52ce --- /dev/null +++ b/src/cuda_by_example/.make_tar @@ -0,0 +1 @@ +./ diff --git a/src/cuda_by_example/cudaMallocAndMemcpy.cu b/src/cuda_by_example/cudaMallocAndMemcpy.cu new file mode 100644 index 0000000..50aac75 --- /dev/null +++ b/src/cuda_by_example/cudaMallocAndMemcpy.cu @@ -0,0 +1,114 @@ +/* + * Copyright 1993-2008 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO USER: + * + * This source code is subject to NVIDIA ownership rights under U.S. and + * international Copyright laws. Users and possessors of this source code + * are hereby granted a nonexclusive, royalty-free license to use this code + * in individual and commercial software. + * + * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE + * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR + * IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH + * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, + * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS + * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE + * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE + * OR PERFORMANCE OF THIS SOURCE CODE. + * + * U.S. Government End Users. This source code is a "commercial item" as + * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of + * "commercial computer software" and "commercial computer software + * documentation" as such terms are used in 48 C.F.R. 
12.212 (SEPT 1995) + * and is provided to the U.S. Government only as a commercial end item. + * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through + * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the + * source code with only those rights set forth herein. + * + * Any use of this source code in individual and commercial software must + * include, in the user documentation and internal comments to the code, + * the above Disclaimer and U.S. Government End Users Notice. + */ + +// includes, system +#include +#include + +// Simple utility function to check for CUDA runtime errors +void checkCUDAError(const char *msg); + +/////////////////////////////////////////////////////////////////////////////// +// Program main +/////////////////////////////////////////////////////////////////////////////// +int main(int argc, char **argv) +{ + + // pointer and dimension for host memory + int n, dimA; + float *h_a; + + // pointers for device memory + float *d_a, *d_b; + + // allocate and initialize host memory + // Bonus: try using cudaMallocHost in place of malloc + dimA = 8; + h_a = (float *)malloc(dimA * sizeof(float)); + for (n = 0; n < dimA; n++) { + h_a[n] = (float)n; + } + // Part 1 of 5: allocate device memory + size_t memSize = dimA * sizeof(float); + cudaMalloc((void **)&d_a, memSize); + cudaMalloc((void **)&d_b, memSize); + + // Part 2 of 5: host to device memory copy + cudaMemcpy(d_a, h_a, memSize, cudaMemcpyHostToDevice); + + // Part 3 of 5: device to device memory copy + cudaMemcpy(d_b, d_a, memSize, cudaMemcpyDeviceToDevice); + + // clear host memory + for (n = 0; n < dimA; n++) { + h_a[n] = 0.f; + } + + // Part 4 of 5: device to host copy + cudaMemcpy(h_a, d_b, memSize, cudaMemcpyDeviceToHost); + + // Check for any CUDA errors + checkCUDAError("cudaMemcpy calls"); + + // verify the data on the host is correct + for (n = 0; n < dimA; n++) { + assert(h_a[n] == (float)n); + } + // Part 5 of 5: free device memory pointers d_a and d_b + 
cudaFree(d_b); + cudaFree(d_a); + + // Check for any CUDA errors + checkCUDAError("cudaFree"); + + // free host memory pointer h_a + // Bonus: be sure to use cudaFreeHost for memory allocated with cudaMallocHost + free(h_a); + + // If the program makes it this far, then the results are correct and + // there are no run-time errors. Good work! + printf("Correct!\n"); + return 0; +} + +void checkCUDAError(const char *msg) +{ + cudaError_t err = cudaGetLastError(); + if (cudaSuccess != err) { + fprintf(stderr, "Cuda error: %s: %s.\n", msg, + cudaGetErrorString(err)); + exit(-1); + } +} diff --git a/src/cuda_by_example/myFirstKernel.cu b/src/cuda_by_example/myFirstKernel.cu new file mode 100644 index 0000000..8e23935 --- /dev/null +++ b/src/cuda_by_example/myFirstKernel.cu @@ -0,0 +1,113 @@ +/* + * Copyright 1993-2008 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO USER: + * + * This source code is subject to NVIDIA ownership rights under U.S. and + * international Copyright laws. Users and possessors of this source code + * are hereby granted a nonexclusive, royalty-free license to use this code + * in individual and commercial software. + * + * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE + * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR + * IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH + * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, + * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS + * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE + * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE + * OR PERFORMANCE OF THIS SOURCE CODE. + * + * U.S. Government End Users. This source code is a "commercial item" as + * that term is defined at 48 C.F.R. 
2.101 (OCT 1995), consisting of + * "commercial computer software" and "commercial computer software + * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) + * and is provided to the U.S. Government only as a commercial end item. + * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through + * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the + * source code with only those rights set forth herein. + * + * Any use of this source code in individual and commercial software must + * include, in the user documentation and internal comments to the code, + * the above Disclaimer and U.S. Government End Users Notice. + */ + +// includes, system +#include +#include + +// Simple utility function to check for CUDA runtime errors +void checkCUDAError(const char *msg); + +// Part 3 of 5: implement the kernel +__global__ void myFirstKernel(int *d_a) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + d_a[idx] = 1000 * blockIdx.x + threadIdx.x; +} + +//////////////////////////////////////////////////////////////////////////////// +// Program main +//////////////////////////////////////////////////////////////////////////////// +int main(int argc, char **argv) +{ + + // pointer for host memory + int *h_a; + + // pointer for device memory + int *d_a; + + // define grid and block size + int numBlocks = 8; + int numThreadsPerBlock = 8; + + // Part 1 of 5: allocate host and device memory + size_t memSize = numBlocks * numThreadsPerBlock * sizeof(int); + h_a = (int *)malloc(memSize); + cudaMalloc((void **)&d_a, memSize); + + // Part 2 of 5: launch kernel + dim3 dimGrid(numBlocks); + dim3 dimBlock(numThreadsPerBlock); + myFirstKernel <<< dimGrid, dimBlock >>> (d_a); + + // block until the device has completed + cudaThreadSynchronize(); + + // check if kernel execution generated an error + checkCUDAError("kernel execution"); + + // Part 4 of 5: device to host copy + cudaMemcpy(h_a, d_a, memSize, cudaMemcpyDeviceToHost); + + // Check for any 
CUDA errors + checkCUDAError("cudaMemcpy"); + + // Part 5 of 5: verify the data returned to the host is correct + for (int i = 0; i < numBlocks; i++) { + for (int j = 0; j < numThreadsPerBlock; j++) { + assert(h_a[i * numThreadsPerBlock + j] == 1000 * i + j); + }} + // free device memory + cudaFree(d_a); + + // free host memory + free(h_a); + + // If the program makes it this far, then the results are correct and + // there are no run-time errors. Good work! + printf("Correct!\n"); + return 0; +} + +void checkCUDAError(const char *msg) +{ + cudaError_t err = cudaGetLastError(); + if (cudaSuccess != err) { + fprintf(stderr, "Cuda error: %s: %s.\n", msg, + cudaGetErrorString(err)); + exit(-1); + } +} diff --git a/src/cuda_by_example/reverseArray_multiblock.cu b/src/cuda_by_example/reverseArray_multiblock.cu new file mode 100644 index 0000000..504e488 --- /dev/null +++ b/src/cuda_by_example/reverseArray_multiblock.cu @@ -0,0 +1,128 @@ +/* + * Copyright 1993-2008 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO USER: + * + * This source code is subject to NVIDIA ownership rights under U.S. and + * international Copyright laws. Users and possessors of this source code + * are hereby granted a nonexclusive, royalty-free license to use this code + * in individual and commercial software. + * + * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE + * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR + * IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH + * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. 
+ * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, + * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS + * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE + * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE + * OR PERFORMANCE OF THIS SOURCE CODE. + * + * U.S. Government End Users. This source code is a "commercial item" as + * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of + * "commercial computer software" and "commercial computer software + * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) + * and is provided to the U.S. Government only as a commercial end item. + * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through + * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the + * source code with only those rights set forth herein. + * + * Any use of this source code in individual and commercial software must + * include, in the user documentation and internal comments to the code, + * the above Disclaimer and U.S. Government End Users Notice. 
+ */ + +// includes, system +#include +#include + +// Simple utility function to check for CUDA runtime errors +void checkCUDAError(const char *msg); + +// Part3: implement the kernel +__global__ void reverseArrayBlock(int *d_out, int *d_in) +{ + int inOffset = blockDim.x * blockIdx.x; + int outOffset = blockDim.x * (gridDim.x - 1 - blockIdx.x); + int in = inOffset + threadIdx.x; + int out = outOffset + (blockDim.x - 1 - threadIdx.x); + d_out[out] = d_in[in]; +} + +//////////////////////////////////////////////////////////////////////////////// +// Program main +//////////////////////////////////////////////////////////////////////////////// +int main(int argc, char **argv) +{ + + // pointer for host memory and size + int *h_a; + int dimA = 256 * 1024; // 256K elements (1MB total) + + // pointer for device memory + int *d_b, *d_a; + + // define grid and block size + int numThreadsPerBlock = 256; + + // Part 1: compute number of blocks needed based on array size and desired block size + int numBlocks = dimA / numThreadsPerBlock; + + // allocate host and device memory + size_t memSize = numBlocks * numThreadsPerBlock * sizeof(int); + h_a = (int *)malloc(memSize); + cudaMalloc((void **)&d_a, memSize); + cudaMalloc((void **)&d_b, memSize); + + // Initialize input array on host + for (int i = 0; i < dimA; ++i) { + h_a[i] = i; + } + // Copy host array to device array + cudaMemcpy(d_a, h_a, memSize, cudaMemcpyHostToDevice); + + // launch kernel + dim3 dimGrid(numBlocks); + dim3 dimBlock(numThreadsPerBlock); + reverseArrayBlock <<< dimGrid, dimBlock >>> (d_b, d_a); + + // block until the device has completed + cudaThreadSynchronize(); + + // check if kernel execution generated an error + // Check for any CUDA errors + checkCUDAError("kernel invocation"); + + // device to host copy + cudaMemcpy(h_a, d_b, memSize, cudaMemcpyDeviceToHost); + + // Check for any CUDA errors + checkCUDAError("memcpy"); + + // verify the data returned to the host is correct + for (int i = 0; i < 
dimA; i++) { + assert(h_a[i] == dimA - 1 - i); + } + // free device memory + cudaFree(d_a); + cudaFree(d_b); + + // free host memory + free(h_a); + + // If the program makes it this far, then the results are correct and + // there are no run-time errors. Good work! + printf("Correct!\n"); + return 0; +} + +void checkCUDAError(const char *msg) +{ + cudaError_t err = cudaGetLastError(); + if (cudaSuccess != err) { + fprintf(stderr, "Cuda error: %s: %s.\n", msg, + cudaGetErrorString(err)); + exit(EXIT_FAILURE); + } +} diff --git a/src/cuda_by_example/reverseArray_multiblock_fast.cu b/src/cuda_by_example/reverseArray_multiblock_fast.cu new file mode 100644 index 0000000..0231006 --- /dev/null +++ b/src/cuda_by_example/reverseArray_multiblock_fast.cu @@ -0,0 +1,143 @@ +/* + * Copyright 1993-2008 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO USER: + * + * This source code is subject to NVIDIA ownership rights under U.S. and + * international Copyright laws. Users and possessors of this source code + * are hereby granted a nonexclusive, royalty-free license to use this code + * in individual and commercial software. + * + * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE + * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR + * IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH + * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, + * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS + * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE + * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE + * OR PERFORMANCE OF THIS SOURCE CODE. + * + * U.S. Government End Users. This source code is a "commercial item" as + * that term is defined at 48 C.F.R. 
2.101 (OCT 1995), consisting of + * "commercial computer software" and "commercial computer software + * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) + * and is provided to the U.S. Government only as a commercial end item. + * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through + * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the + * source code with only those rights set forth herein. + * + * Any use of this source code in individual and commercial software must + * include, in the user documentation and internal comments to the code, + * the above Disclaimer and U.S. Government End Users Notice. + */ + +// includes, system +#include +#include + +// Simple utility function to check for CUDA runtime errors +void checkCUDAError(const char *msg); + +// Part 2 of 2: implement the fast kernel using shared memory +__global__ void reverseArrayBlock(int *d_out, int *d_in) +{ + extern __shared__ int s_data[]; + int inOffset = blockDim.x * blockIdx.x; + int in = inOffset + threadIdx.x; + + // Load one element per thread from device memory and store it + // *in reversed order* into temporary shared memory + s_data[blockDim.x - 1 - threadIdx.x] = d_in[in]; + + // Block until all threads in the block have written their data to shared mem + __syncthreads(); + + // write the data from shared memory in forward order, + // but to the reversed block offset as before + int outOffset = blockDim.x * (gridDim.x - 1 - blockIdx.x); + int out = outOffset + threadIdx.x; + d_out[out] = s_data[threadIdx.x]; +} + +//////////////////////////////////////////////////////////////////////////////// +// Program main +//////////////////////////////////////////////////////////////////////////////// +int main(int argc, char **argv) +{ + + // pointer for host memory and size + int *h_a; + int dimA = 256 * 1024; // 256K elements (1MB total) + + // pointer for device memory + int *d_b, *d_a; + + // define grid and block size + int numThreadsPerBlock 
= 256; + + // Compute number of blocks needed based on array size and desired block size + int numBlocks = dimA / numThreadsPerBlock; + + // Part 1 of 2: Compute the number of bytes of shared memory needed + // This is used in the kernel invocation below + int sharedMemSize = numThreadsPerBlock * sizeof(int); + + // allocate host and device memory + size_t memSize = numBlocks * numThreadsPerBlock * sizeof(int); + h_a = (int *)malloc(memSize); + cudaMalloc((void **)&d_a, memSize); + cudaMalloc((void **)&d_b, memSize); + + // Initialize input array on host + for (int i = 0; i < dimA; ++i) { + h_a[i] = i; + } + // Copy host array to device array + cudaMemcpy(d_a, h_a, memSize, cudaMemcpyHostToDevice); + + // launch kernel + dim3 dimGrid(numBlocks); + dim3 dimBlock(numThreadsPerBlock); + reverseArrayBlock <<< dimGrid, dimBlock, sharedMemSize >>> (d_b, d_a); + + // block until the device has completed + cudaThreadSynchronize(); + + // check if kernel execution generated an error + // Check for any CUDA errors + checkCUDAError("kernel invocation"); + + // device to host copy + cudaMemcpy(h_a, d_b, memSize, cudaMemcpyDeviceToHost); + + // Check for any CUDA errors + checkCUDAError("memcpy"); + + // verify the data returned to the host is correct + for (int i = 0; i < dimA; i++) { + assert(h_a[i] == dimA - 1 - i); + } + // free device memory + cudaFree(d_a); + cudaFree(d_b); + + // free host memory + free(h_a); + + // If the program makes it this far, then the results are correct and + // there are no run-time errors. Good work! 
+ printf("Correct!\n"); + return 0; +} + +void checkCUDAError(const char *msg) +{ + cudaError_t err = cudaGetLastError(); + if (cudaSuccess != err) { + fprintf(stderr, "Cuda error: %s: %s.\n", msg, + cudaGetErrorString(err)); + exit(EXIT_FAILURE); + } +} diff --git a/src/cuda_by_example/reverseArray_singleblock.cu b/src/cuda_by_example/reverseArray_singleblock.cu new file mode 100644 index 0000000..3161f4a --- /dev/null +++ b/src/cuda_by_example/reverseArray_singleblock.cu @@ -0,0 +1,124 @@ +/* + * Copyright 1993-2008 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO USER: + * + * This source code is subject to NVIDIA ownership rights under U.S. and + * international Copyright laws. Users and possessors of this source code + * are hereby granted a nonexclusive, royalty-free license to use this code + * in individual and commercial software. + * + * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE + * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR + * IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH + * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, + * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS + * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE + * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE + * OR PERFORMANCE OF THIS SOURCE CODE. + * + * U.S. Government End Users. This source code is a "commercial item" as + * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of + * "commercial computer software" and "commercial computer software + * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) + * and is provided to the U.S. Government only as a commercial end item. + * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 
227.7202-1 through + * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the + * source code with only those rights set forth herein. + * + * Any use of this source code in individual and commercial software must + * include, in the user documentation and internal comments to the code, + * the above Disclaimer and U.S. Government End Users Notice. + */ + +// includes, system +#include +#include + +// Simple utility function to check for CUDA runtime errors +void checkCUDAError(const char *msg); + +// Part 1 of 1: implement the kernel +__global__ void reverseArrayBlock(int *d_out, int *d_in) +{ + int in = threadIdx.x; + int out = blockDim.x - 1 - threadIdx.x; + d_out[out] = d_in[in]; +} + +//////////////////////////////////////////////////////////////////////////////// +// Program main +//////////////////////////////////////////////////////////////////////////////// +int main(int argc, char **argv) +{ + + // pointer for host memory and size + int *h_a; + int dimA = 256; + + // pointer for device memory + int *d_b, *d_a; + + // define grid and block size + int numBlocks = 1; + int numThreadsPerBlock = dimA; + + // allocate host and device memory + size_t memSize = numBlocks * numThreadsPerBlock * sizeof(int); + h_a = (int *)malloc(memSize); + cudaMalloc((void **)&d_a, memSize); + cudaMalloc((void **)&d_b, memSize); + + // Initialize input array on host + for (int i = 0; i < dimA; ++i) { + h_a[i] = i; + } + // Copy host array to device array + cudaMemcpy(d_a, h_a, memSize, cudaMemcpyHostToDevice); + + // launch kernel + dim3 dimGrid(numBlocks); + dim3 dimBlock(numThreadsPerBlock); + reverseArrayBlock <<< dimGrid, dimBlock >>> (d_b, d_a); + + // block until the device has completed + cudaThreadSynchronize(); + + // check if kernel execution generated an error + // Check for any CUDA errors + checkCUDAError("kernel invocation"); + + // device to host copy + cudaMemcpy(h_a, d_b, memSize, cudaMemcpyDeviceToHost); + + // Check for any CUDA errors + 
checkCUDAError("memcpy"); + + // verify the data returned to the host is correct + for (int i = 0; i < dimA; i++) { + assert(h_a[i] == dimA - 1 - i); + } + // free device memory + cudaFree(d_a); + cudaFree(d_b); + + // free host memory + free(h_a); + + // If the program makes it this far, then the results are correct and + // there are no run-time errors. Good work! + printf("Correct!\n"); + return 0; +} + +void checkCUDAError(const char *msg) +{ + cudaError_t err = cudaGetLastError(); + if (cudaSuccess != err) { + fprintf(stderr, "Cuda error: %s: %s.\n", msg, + cudaGetErrorString(err)); + exit(EXIT_FAILURE); + } +} -- 2.26.2