Skip to content
Snippets Groups Projects
Commit 47a847bd authored by Erik Strand's avatar Erik Strand
Browse files

Add a multi GPU SAXPY example

parent b3375cb0
Branches develop master
No related tags found
No related merge requests found
......@@ -5,3 +5,5 @@
# binaries
get_gpu_info
saxpy
saxpy_multi_gpu
.PHONY: all
all: get_gpu_info saxpy
all: get_gpu_info saxpy saxpy_multi_gpu
get_gpu_info: get_gpu_info.cu
nvcc get_gpu_info.cu -o get_gpu_info
saxpy: saxpy.cu
nvcc saxpy.cu -O3 -o saxpy
saxpy_multi_gpu: saxpy_multi_gpu.cu
nvcc saxpy_multi_gpu.cu -O3 -o saxpy_multi_gpu
// This code performs a single precision a*X plus Y operation on multiple GPUs.
// Adapated from https://developer.nvidia.com/blog/easy-introduction-cuda-c-and-c/
#include <algorithm>
#include <chrono>
#include <cmath>
#include <iostream>
#include <thread>
#include <vector>
void saxpy_cpu(int n, float a, float *x, float *y) {
for (int i = 0; i < n; ++i) {
y[i] = a * x[i] + y[i];
}
}
__global__
void saxpy_gpu(int n, float a, float *x, float *y) {
int const i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < n) {
y[i] = a * x[i] + y[i];
}
}
int main() {
// We'll put 2^20 numbers in each vector.
int const N = 1048576;
std::cout << "Performing SAXPY on vectors of dim " << N << ".\n";
// Determine how many GPUs we have.
int n_gpus;
cudaGetDeviceCount(&n_gpus);
std::cout << "Found " << n_gpus << " GPUs.\n\n";
// Allocate host (CPU) memory.
float *h_x, *h_y, *h_z;
h_x = (float*)malloc(N * sizeof(float));
h_y = (float*)malloc(N * sizeof(float));
h_z = (float*)malloc(N * sizeof(float));
std::vector<float*> h_z_gpu(n_gpus);
for (int i = 0; i < n_gpus; ++i) {
h_z_gpu[i] = (float*)malloc(N * sizeof(float));
}
// Initialize host data.
for (int i = 0; i < N; ++i) {
h_x[i] = 1.0f;
h_y[i] = 2.0f;
h_z[i] = 2.0f;
}
// Run the SAXPY kernel on each GPU. We use a separate thread to manage each GPU.
std::vector<std::thread> saxpy_threads(n_gpus);
std::vector<std::chrono::duration<double>> gpu_times(n_gpus);
for (int i = 0; i < n_gpus; ++i) {
saxpy_threads[i] = std::thread([&, i]() {
// Switch to the correct GPU.
cudaSetDevice(i);
// Allocate device (GPU) memory.
float *d_x, *d_y;
cudaMalloc(&d_x, N * sizeof(float));
cudaMalloc(&d_y, N * sizeof(float));
// Copy data to the GPU.
cudaMemcpy(d_x, h_x, N * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_y, h_y, N * sizeof(float), cudaMemcpyHostToDevice);
// Perform SAXPY on the GPU.
int threads_per_block = 256;
int blocks = (N + threads_per_block - 1) / threads_per_block;
auto start = std::chrono::high_resolution_clock::now();
saxpy_gpu<<<blocks, threads_per_block>>>(N, 2.0f, d_x, d_y);
cudaDeviceSynchronize();
auto stop = std::chrono::high_resolution_clock::now();
gpu_times[i] = stop - start;
// Copy results back to the CPU.
cudaMemcpy(h_z_gpu[i], d_y, N * sizeof(float), cudaMemcpyDeviceToHost);
cudaDeviceSynchronize();
// Free memory.
cudaFree(d_x);
cudaFree(d_y);
});
}
// Perform SAXPY on the CPU.
auto start = std::chrono::high_resolution_clock::now();
saxpy_cpu(N, 2.0f, h_x, h_z);
auto stop = std::chrono::high_resolution_clock::now();
float cpu_time = std::chrono::duration_cast<std::chrono::microseconds>(stop - start).count();
// Wait for all the GPU work to finish.
for (auto& thread: saxpy_threads) {
thread.join();
}
// Print work times.
std::cout << "CPU time: " << cpu_time << " microseconds\n";
for (int i = 0; i < n_gpus; ++i) {
auto micros = std::chrono::duration_cast<std::chrono::microseconds>(gpu_times[i]);
std::cout << "GPU " << i << " time: " << micros.count() << " microseconds\n";
}
std::cout << '\n';
// Compare results.
std::vector<float> gpu_errors(n_gpus);
for (int i = 0; i < n_gpus; ++i) {
saxpy_threads[i] = std::thread([&, i]() {
float max_error = 0.0f;
for (int j = 0; j < N; j++) {
max_error = std::max(max_error, std::abs(h_z_gpu[i][j] - h_z[j]));
}
gpu_errors[i] = max_error;
});
}
// Wait for all the comparisons to finish.
for (auto& thread: saxpy_threads) {
thread.join();
}
// Print comparisons.
for (int i = 0; i < n_gpus; ++i) {
std::cout << "GPU " << i << " max error: " << gpu_errors[i] << '\n';
}
std::cout << '\n';
free(h_x);
free(h_y);
free(h_z);
for (int i = 0; i < n_gpus; ++i) {
free(h_z_gpu[i]);
}
return 0;
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment