Add a multi GPU SAXPY example

47a847bd · Erik Strand · b3375cb0 · 47a847bd · 47a847bd · 47a847bd
Commit 47a847bd authored 4 years ago by Erik Strand
--- a/.gitignore
+++ b/.gitignore
@@ -5,3 +5,5 @@
 # binaries
 get_gpu_info
 saxpy
+saxpy_multi_gpu
+
--- a/Makefile
+++ b/Makefile
 .PHONY: all
-all: get_gpu_info saxpy
+all: get_gpu_info saxpy saxpy_multi_gpu

 get_gpu_info: get_gpu_info.cu
 	nvcc get_gpu_info.cu -o get_gpu_info

 saxpy: saxpy.cu
 	nvcc saxpy.cu -O3 -o saxpy
+
+saxpy_multi_gpu: saxpy_multi_gpu.cu
+	nvcc saxpy_multi_gpu.cu -O3 -o saxpy_multi_gpu
--- a/saxpy_multi_gpu.cu
+++ b/saxpy_multi_gpu.cu
+// This code performs a single precision a*X plus Y operation on multiple GPUs.
+// Adapated from https://developer.nvidia.com/blog/easy-introduction-cuda-c-and-c/
+
+#include <algorithm>
+#include <chrono>
+#include <cmath>
+#include <iostream>
+#include <thread>
+#include <vector>
+
+void saxpy_cpu(int n, float a, float *x, float *y) {
+    for (int i = 0; i < n; ++i) {
+        y[i] = a * x[i] + y[i];
+    }
+}
+
+__global__
+void saxpy_gpu(int n, float a, float *x, float *y) {
+    int const i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i < n) {
+        y[i] = a * x[i] + y[i];
+    }
+}
+
+int main() {
+    // We'll put 2^20 numbers in each vector.
+    int const N = 1048576;
+    std::cout << "Performing SAXPY on vectors of dim " << N << ".\n";
+
+    // Determine how many GPUs we have.
+    int n_gpus;
+    cudaGetDeviceCount(&n_gpus);
+    std::cout << "Found " << n_gpus << " GPUs.\n\n";
+
+    // Allocate host (CPU) memory.
+    float *h_x, *h_y, *h_z;
+    h_x = (float*)malloc(N * sizeof(float));
+    h_y = (float*)malloc(N * sizeof(float));
+    h_z = (float*)malloc(N * sizeof(float));
+    std::vector<float*> h_z_gpu(n_gpus);
+    for (int i = 0; i < n_gpus; ++i) {
+        h_z_gpu[i] = (float*)malloc(N * sizeof(float));
+    }
+
+    // Initialize host data.
+    for (int i = 0; i < N; ++i) {
+        h_x[i] = 1.0f;
+        h_y[i] = 2.0f;
+        h_z[i] = 2.0f;
+    }
+
+    // Run the SAXPY kernel on each GPU. We use a separate thread to manage each GPU.
+    std::vector<std::thread> saxpy_threads(n_gpus);
+    std::vector<std::chrono::duration<double>> gpu_times(n_gpus);
+    for (int i = 0; i < n_gpus; ++i) {
+        saxpy_threads[i] = std::thread([&, i]() {
+            // Switch to the correct GPU.
+            cudaSetDevice(i);
+
+            // Allocate device (GPU) memory.
+            float *d_x, *d_y;
+            cudaMalloc(&d_x, N * sizeof(float));
+            cudaMalloc(&d_y, N * sizeof(float));
+
+            // Copy data to the GPU.
+            cudaMemcpy(d_x, h_x, N * sizeof(float), cudaMemcpyHostToDevice);
+            cudaMemcpy(d_y, h_y, N * sizeof(float), cudaMemcpyHostToDevice);
+
+            // Perform SAXPY on the GPU.
+            int threads_per_block = 256;
+            int blocks = (N + threads_per_block - 1) / threads_per_block;
+            auto start = std::chrono::high_resolution_clock::now();
+            saxpy_gpu<<<blocks, threads_per_block>>>(N, 2.0f, d_x, d_y);
+            cudaDeviceSynchronize();
+            auto stop = std::chrono::high_resolution_clock::now();
+            gpu_times[i] = stop - start;
+
+            // Copy results back to the CPU.
+            cudaMemcpy(h_z_gpu[i], d_y, N * sizeof(float), cudaMemcpyDeviceToHost);
+            cudaDeviceSynchronize();
+
+            // Free memory.
+            cudaFree(d_x);
+            cudaFree(d_y);
+        });
+    }
+
+    // Perform SAXPY on the CPU.
+    auto start = std::chrono::high_resolution_clock::now();
+    saxpy_cpu(N, 2.0f, h_x, h_z);
+    auto stop = std::chrono::high_resolution_clock::now();
+    float cpu_time = std::chrono::duration_cast<std::chrono::microseconds>(stop - start).count();
+
+    // Wait for all the GPU work to finish.
+    for (auto& thread: saxpy_threads) {
+        thread.join();
+    }
+
+    // Print work times.
+    std::cout << "CPU time: " << cpu_time << " microseconds\n";
+    for (int i = 0; i < n_gpus; ++i) {
+        auto micros = std::chrono::duration_cast<std::chrono::microseconds>(gpu_times[i]);
+        std::cout << "GPU " << i << " time: " << micros.count() << " microseconds\n";
+    }
+    std::cout << '\n';
+
+    // Compare results.
+    std::vector<float> gpu_errors(n_gpus);
+    for (int i = 0; i < n_gpus; ++i) {
+        saxpy_threads[i] = std::thread([&, i]() {
+            float max_error = 0.0f;
+            for (int j = 0; j < N; j++) {
+                max_error = std::max(max_error, std::abs(h_z_gpu[i][j] - h_z[j]));
+            }
+            gpu_errors[i] = max_error;
+        });
+    }
+
+    // Wait for all the comparisons to finish.
+    for (auto& thread: saxpy_threads) {
+        thread.join();
+    }
+
+    // Print comparisons.
+    for (int i = 0; i < n_gpus; ++i) {
+        std::cout << "GPU " << i << " max error: " << gpu_errors[i] << '\n';
+    }
+    std::cout << '\n';
+
+    free(h_x);
+    free(h_y);
+    free(h_z);
+    for (int i = 0; i < n_gpus; ++i) {
+        free(h_z_gpu[i]);
+    }
+
+    return 0;
+}