Commit 47a847bd
Authored 4 years ago by Erik Strand

Add a multi GPU SAXPY example

Parent: b3375cb0
Branches: develop, master

Showing 3 changed files with 144 additions and 1 deletion:

.gitignore          +2 −0
Makefile            +4 −1
saxpy_multi_gpu.cu  +138 −0
.gitignore  +2 −0

...
@@ -5,3 +5,5 @@
 # binaries
 get_gpu_info
 saxpy
+saxpy_multi_gpu
Makefile  +4 −1

 .PHONY: all

-all: get_gpu_info saxpy
+all: get_gpu_info saxpy saxpy_multi_gpu

 get_gpu_info: get_gpu_info.cu
 	nvcc get_gpu_info.cu -o get_gpu_info

 saxpy: saxpy.cu
 	nvcc saxpy.cu -O3 -o saxpy

+saxpy_multi_gpu: saxpy_multi_gpu.cu
+	nvcc saxpy_multi_gpu.cu -O3 -o saxpy_multi_gpu
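With this rule in place, the default all target also builds the new example. Assuming nvcc is on the PATH and at least one CUDA-capable GPU is present, running the resulting ./saxpy_multi_gpu binary prints the vector dimension, the number of GPUs found, the CPU and per-GPU SAXPY times in microseconds, and each GPU's maximum error against the CPU result.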
saxpy_multi_gpu.cu  (new file, 0 → 100644)  +138 −0
// This code performs a single precision a*X plus Y operation on multiple GPUs.
// Adapted from https://developer.nvidia.com/blog/easy-introduction-cuda-c-and-c/
#include <algorithm>
#include <chrono>
#include <cmath>
#include <iostream>
#include <thread>
#include <vector>

void saxpy_cpu(int n, float a, float* x, float* y) {
    for (int i = 0; i < n; ++i) {
        y[i] = a * x[i] + y[i];
    }
}

__global__
void saxpy_gpu(int n, float a, float* x, float* y) {
    int const i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        y[i] = a * x[i] + y[i];
    }
}

int main() {
    // We'll put 2^20 numbers in each vector.
    int const N = 1048576;
    std::cout << "Performing SAXPY on vectors of dim " << N << ".\n";

    // Determine how many GPUs we have.
    int n_gpus;
    cudaGetDeviceCount(&n_gpus);
    std::cout << "Found " << n_gpus << " GPUs.\n\n";

    // Allocate host (CPU) memory.
    float *h_x, *h_y, *h_z;
    h_x = (float*)malloc(N * sizeof(float));
    h_y = (float*)malloc(N * sizeof(float));
    h_z = (float*)malloc(N * sizeof(float));
    std::vector<float*> h_z_gpu(n_gpus);
    for (int i = 0; i < n_gpus; ++i) {
        h_z_gpu[i] = (float*)malloc(N * sizeof(float));
    }

    // Initialize host data.
    for (int i = 0; i < N; ++i) {
        h_x[i] = 1.0f;
        h_y[i] = 2.0f;
        h_z[i] = 2.0f;
    }

    // Run the SAXPY kernel on each GPU. We use a separate thread to manage each GPU.
    std::vector<std::thread> saxpy_threads(n_gpus);
    std::vector<std::chrono::duration<double>> gpu_times(n_gpus);
    for (int i = 0; i < n_gpus; ++i) {
        saxpy_threads[i] = std::thread([&, i]() {
            // Switch to the correct GPU.
            cudaSetDevice(i);

            // Allocate device (GPU) memory.
            float *d_x, *d_y;
            cudaMalloc(&d_x, N * sizeof(float));
            cudaMalloc(&d_y, N * sizeof(float));

            // Copy data to the GPU.
            cudaMemcpy(d_x, h_x, N * sizeof(float), cudaMemcpyHostToDevice);
            cudaMemcpy(d_y, h_y, N * sizeof(float), cudaMemcpyHostToDevice);

            // Perform SAXPY on the GPU.
            int threads_per_block = 256;
            int blocks = (N + threads_per_block - 1) / threads_per_block;
            auto start = std::chrono::high_resolution_clock::now();
            saxpy_gpu<<<blocks, threads_per_block>>>(N, 2.0f, d_x, d_y);
            cudaDeviceSynchronize();
            auto stop = std::chrono::high_resolution_clock::now();
            gpu_times[i] = stop - start;

            // Copy results back to the CPU.
            cudaMemcpy(h_z_gpu[i], d_y, N * sizeof(float), cudaMemcpyDeviceToHost);
            cudaDeviceSynchronize();

            // Free memory.
            cudaFree(d_x);
            cudaFree(d_y);
        });
    }

    // Perform SAXPY on the CPU.
    auto start = std::chrono::high_resolution_clock::now();
    saxpy_cpu(N, 2.0f, h_x, h_z);
    auto stop = std::chrono::high_resolution_clock::now();
    float cpu_time = std::chrono::duration_cast<std::chrono::microseconds>(stop - start).count();

    // Wait for all the GPU work to finish.
    for (auto& thread : saxpy_threads) {
        thread.join();
    }

    // Print work times.
    std::cout << "CPU time: " << cpu_time << " microseconds\n";
    for (int i = 0; i < n_gpus; ++i) {
        auto micros = std::chrono::duration_cast<std::chrono::microseconds>(gpu_times[i]);
        std::cout << "GPU " << i << " time: " << micros.count() << " microseconds\n";
    }
    std::cout << '\n';

    // Compare results.
    std::vector<float> gpu_errors(n_gpus);
    for (int i = 0; i < n_gpus; ++i) {
        saxpy_threads[i] = std::thread([&, i]() {
            float max_error = 0.0f;
            for (int j = 0; j < N; j++) {
                max_error = std::max(max_error, std::abs(h_z_gpu[i][j] - h_z[j]));
            }
            gpu_errors[i] = max_error;
        });
    }

    // Wait for all the comparisons to finish.
    for (auto& thread : saxpy_threads) {
        thread.join();
    }

    // Print comparisons.
    for (int i = 0; i < n_gpus; ++i) {
        std::cout << "GPU " << i << " max error: " << gpu_errors[i] << '\n';
    }
    std::cout << '\n';

    free(h_x);
    free(h_y);
    free(h_z);
    for (int i = 0; i < n_gpus; ++i) {
        free(h_z_gpu[i]);
    }

    return 0;
}
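One caveat worth noting: the CUDA runtime calls in this example discard their return codes, so a failure on one device (say, cudaMalloc running out of memory) would surface only as a bogus max error at the end. Below is a minimal error-checking sketch using only standard CUDA runtime calls (cudaError_t, cudaGetErrorString, cudaGetLastError); the CUDA_CHECK macro name is hypothetical and not part of this commit.

#include <cstdio>
#include <cstdlib>

// Hypothetical helper (not part of this commit): abort with a readable
// message if a CUDA runtime call fails.
#define CUDA_CHECK(call)                                                  \
    do {                                                                  \
        cudaError_t err_ = (call);                                        \
        if (err_ != cudaSuccess) {                                        \
            std::fprintf(stderr, "CUDA error at %s:%d: %s\n",             \
                         __FILE__, __LINE__, cudaGetErrorString(err_));   \
            std::exit(EXIT_FAILURE);                                      \
        }                                                                 \
    } while (0)

// Inside each worker thread, the unchecked calls would become:
//   CUDA_CHECK(cudaSetDevice(i));
//   CUDA_CHECK(cudaMalloc(&d_x, N * sizeof(float)));
//   saxpy_gpu<<<blocks, threads_per_block>>>(N, 2.0f, d_x, d_y);
//   CUDA_CHECK(cudaGetLastError());       // reports kernel launch failures
//   CUDA_CHECK(cudaDeviceSynchronize());  // reports asynchronous errors

Since kernel launches return no status themselves, the cudaGetLastError / cudaDeviceSynchronize pair after the launch is the usual way to catch both launch-time and execution-time failures.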