1 files changed, 62 insertions, 0 deletions
diff --git a/vector_sum/main.cu b/vector_sum/main.cu
new file mode 100644
index 0000000..ba2e964
--- /dev/null
+++ b/vector_sum/main.cu
@@ -0,0 +1,62 @@
+#include <cstdio>
+// Element-wise vector sum: out[i] = a[i] + b[i] for every i in [0, N).
+//
+// Parameters:
+//   N   - number of elements to process; nothing is written when N <= 0.
+//   a   - first input array, readable from device code, at least N ints.
+//   b   - second input array, readable from device code, at least N ints.
+//   out - output array, writable from device code, at least N ints. It may
+//         be the same pointer as a or b, because each element is read before
+//         it is written and no other thread touches that index.
+//
+// Launch layout: any 1-D grid of 1-D blocks. Each thread starts at its global
+// index and advances by the total number of launched threads, so every
+// element is handled exactly once whatever the grid size, and threads whose
+// first index is already past N do nothing.
+__global__ void add(int N, const int* a, const int* b, int* out) {
+  // 64-bit indices: blockIdx.x * blockDim.x and i + stride may not fit in int.
+  const long long stride = static_cast<long long>(gridDim.x) * blockDim.x;
+  const long long first =
+      static_cast<long long>(blockIdx.x) * blockDim.x + threadIdx.x;
+  for (long long i = first; i < N; i += stride) {
+    out[i] = a[i] + b[i];
+  }
+}
+// Adds two N-element vectors on the GPU and prints the sums on one line.
+//
+// Both inputs are filled with 0..N-1, so element i of the result is 2*i.
+// Returns 0 on success. If a CUDA runtime call or the kernel launch fails,
+// the failed step and the CUDA error string are written to stderr, any device
+// memory that was allocated is freed, and 1 is returned.
+int main() {
+  constexpr int    N     = 100;
+  constexpr size_t bytes = N * sizeof(int);
+  // All locals are declared before the first `goto cleanup`, so no jump
+  // skips over an initialization.
+  bool        success       = false;
+  cudaError_t status        = cudaSuccess;
+  const char* step          = "";
+  int         host_array[N] = {0};
+  int*        dev_arrays[3] = {nullptr};
+  // Allocate device arrays: two inputs and one output.
+  step = "cudaMalloc";
+  for (int i = 0; i < 3; ++i) {
+    status = cudaMalloc(&dev_arrays[i], bytes);
+    if (status != cudaSuccess) {
+      goto cleanup;
+    }
+  }
+  // Fill the host array with values 0..N-1.
+  for (int i = 0; i < N; ++i) {
+    host_array[i] = i;
+  }
+  // Copy the host array to each of the first two device arrays.
+  step = "cudaMemcpy (host to device)";
+  for (int i = 0; i < 2; ++i) {
+    status = cudaMemcpy(
+        dev_arrays[i], host_array, bytes, cudaMemcpyHostToDevice);
+    if (status != cudaSuccess) {
+      goto cleanup;
+    }
+  }
+  // Add the first two arrays.
+  // N blocks, 1 thread per block.
+  add<<<N, 1>>>(N, dev_arrays[0], dev_arrays[1], dev_arrays[2]);
+  // A launch returns no status of its own; a rejected launch (for example an
+  // invalid configuration) is only visible through cudaGetLastError().
+  step   = "add kernel launch";
+  status = cudaGetLastError();
+  if (status != cudaSuccess) {
+    goto cleanup;
+  }
+  // Copy the result from the third array to the host. This blocking copy
+  // also reports errors raised while the kernel was executing.
+  step   = "cudaMemcpy (device to host)";
+  status = cudaMemcpy(
+      host_array, dev_arrays[2], bytes, cudaMemcpyDeviceToHost);
+  if (status != cudaSuccess) {
+    goto cleanup;
+  }
+  // Print the result.
+  for (int i = 0; i < N; ++i) {
+    printf("%d ", host_array[i]);
+  }
+  printf("\n");
+  success = true;
+cleanup:
+  if (status != cudaSuccess) {
+    fprintf(stderr, "%s failed: %s\n", step, cudaGetErrorString(status));
+  }
+  // Slots that were never allocated are still nullptr; skip them.
+  for (int i = 0; i < 3; ++i) {
+    if (dev_arrays[i] != nullptr) {
+      cudaFree(dev_arrays[i]);
+    }
+  }
+  return success ? 0 : 1;
+}

diff --git a/vector_sum/main.cu b/vector_sum/main.cu new file mode 100644 index 0000000..ba2e964 --- /dev/null +++ b/vector_sum/main.cu
@@ -0,0 +1,62 @@
	1	#include <cstdio>
	2
	3	__global__ void add(int N, int* a, int* b, int* out) {  // Element-wise sum: out[i] = a[i] + b[i] for i in [0, N); needs at least N launched threads in a 1-D grid.
	4	const int id = blockIdx.x * blockDim.x + threadIdx.x;  // Global thread index, so the result no longer depends on using one thread per block.
	5	if (id < N) out[id] = a[id] + b[id];  // Threads whose index is past the end of the arrays do nothing.
	6	}
	7
	8	int main() {  // Adds two N-element vectors on the GPU and prints the sums; returns 0 on success, 1 on any CUDA failure.
	9	constexpr int N = 100;
	10	// All locals are declared before the first `goto cleanup`, so no jump skips an initialization.
	11	bool success = false;
	12	int host_array[N] = {0};
	13	int* dev_arrays[3] = {nullptr};
	14
	15	// Allocate device arrays: two inputs and one output.
	16	for (int i = 0; i < 3; ++i) {
	17	if (cudaMalloc(&dev_arrays[i], N * sizeof(int)) != cudaSuccess) {
	18	goto cleanup;
	19	}
	20	}
	21
	22	// Fill the host array with values 0..N-1.
	23	for (int i = 0; i < N; ++i) {
	24	host_array[i] = i;
	25	}
	26
	27	// Copy the host array to each of the first two device arrays, so both inputs hold 0..N-1.
	28	for (int i = 0; i < 2; ++i) {
	29	if (cudaMemcpy(
	30	dev_arrays[i], host_array, N * sizeof(int),
	31	cudaMemcpyHostToDevice) != cudaSuccess) {
	32	goto cleanup;
	33	}
	34	}
	35
	36	// Add the first two arrays into the third.
	37	// N blocks, 1 thread per block.
	38	add<<<N, 1>>>(N, dev_arrays[0], dev_arrays[1], dev_arrays[2]);
	39	if (cudaGetLastError() != cudaSuccess) goto cleanup;  // A launch returns no status; a rejected launch is only visible here.
	40	// Copy the result from the third array to the host; this blocking copy also reports errors raised while the kernel ran.
	41	if (cudaMemcpy(
	42	host_array, dev_arrays[2], N * sizeof(int), cudaMemcpyDeviceToHost) !=
	43	cudaSuccess) {
	44	goto cleanup;
	45	}
	46
	47	// Print the result on one line, separated by spaces.
	48	for (int i = 0; i < N; ++i) {
	49	printf("%d ", host_array[i]);
	50	}
	51	printf("\n");
	52
	53	success = true;
	54
	55	cleanup:
	56	for (int i = 0; i < 3; ++i) {
	57	if (dev_arrays[i] != nullptr) {  // Slots that were never allocated are still nullptr.
	58	cudaFree(dev_arrays[i]);
	59	}
	60	}
	61	return success ? 0 : 1;
	62	}