// vector_sum/main.cu

#include <cstdio>

// Element-wise integer addition: out[i] = a[i] + b[i] for 0 <= i < N.
//
// Parameters:
//   N   - number of elements to process in each array.
//   a   - first input array (device pointer, at least N elements).
//   b   - second input array (device pointer, at least N elements).
//   out - output array (device pointer, at least N elements).
//
// Launch layout: 1D grid of 1D blocks. Each thread handles at most one
// element, so gridDim.x * blockDim.x must be >= N for every element to be
// written. Threads whose index falls at or beyond N do nothing, so launching
// more threads than elements is safe. No shared memory is used.
__global__ void add(int N, int* a, int* b, int* out) {
  // Flat global thread index. For a launch with one thread per block this
  // reduces to blockIdx.x.
  const int id = blockIdx.x * blockDim.x + threadIdx.x;

  // Guard against out-of-bounds accesses when the launch covers more than N
  // elements.
  if (id < N) {
    out[id] = a[id] + b[id];
  }
}

// Adds two copies of the sequence 0..N-1 on the GPU and prints the result.
//
// Steps: allocate three device arrays, upload the host sequence into the first
// two, launch `add` to write their element-wise sum into the third, copy the
// third back to the host and print it.
//
// Returns 0 on success. On any CUDA failure a diagnostic is written to stderr,
// all device allocations made so far are released, and 1 is returned.
int main() {
  constexpr int N = 100;

  // All locals with initializers are declared before the first `goto cleanup`
  // so that no jump bypasses an initialization.
  bool        success       = false;
  cudaError_t status        = cudaSuccess;
  int         host_array[N] = {0};
  int*        dev_arrays[3] = {nullptr};

  // Allocate device arrays.
  for (int i = 0; i < 3; ++i) {
    status = cudaMalloc(&dev_arrays[i], N * sizeof(int));
    if (status != cudaSuccess) {
      goto cleanup;
    }
  }

  // Fill the host array with values 0..N-1.
  for (int i = 0; i < N; ++i) {
    host_array[i] = i;
  }

  // Copy the host array to each of the first two device arrays.
  for (int i = 0; i < 2; ++i) {
    status = cudaMemcpy(
        dev_arrays[i], host_array, N * sizeof(int), cudaMemcpyHostToDevice);
    if (status != cudaSuccess) {
      goto cleanup;
    }
  }

  // Add the first two arrays.
  // N blocks, 1 thread per block.
  add<<<N, 1>>>(N, dev_arrays[0], dev_arrays[1], dev_arrays[2]);

  // A kernel launch returns no status itself; launch failures (e.g. an invalid
  // configuration) must be fetched explicitly.
  status = cudaGetLastError();
  if (status != cudaSuccess) {
    goto cleanup;
  }

  // Copy the result from the third array to the host. This blocking copy also
  // waits for the kernel and reports any error raised while it was running.
  status = cudaMemcpy(
      host_array, dev_arrays[2], N * sizeof(int), cudaMemcpyDeviceToHost);
  if (status != cudaSuccess) {
    goto cleanup;
  }

  // Print the result.
  for (int i = 0; i < N; ++i) {
    printf("%d ", host_array[i]);
  }
  printf("\n");

  success = true;

cleanup:
  if (!success) {
    fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(status));
  }

  // Release whichever device arrays were successfully allocated.
  for (int i = 0; i < 3; ++i) {
    if (dev_arrays[i] != nullptr) {
      cudaFree(dev_arrays[i]);
    }
  }
  return success ? 0 : 1;
}