Material by: Alan Gray, Kevin Stratford, Rupert Nash
/* Inefficient: data is copied to and from the device every iteration */
for (it = 0; it < nTimeSteps; it++) {
  myCheapHostOperation(hostData);
  cudaMemcpy(..., cudaMemcpyHostToDevice);
  myExpensiveKernel <<<...>>> (deviceData, ...);
  cudaMemcpy(..., cudaMemcpyDeviceToHost);
}
/* Better: a single transfer each way; data stays resident on the device */
cudaMemcpy(..., cudaMemcpyHostToDevice);
for (it = 0; it < nTimeSteps; it++) {
  myCheapKernel <<< ... >>> (deviceData, ...);
  myExpensiveKernel <<< ... >>> (deviceData, ...);
}
cudaMemcpy(..., cudaMemcpyDeviceToHost);
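As a self-contained sketch of the second pattern (the array size N, the time-step count, and the launch parameters blocksPerGrid/threadsPerBlock are illustrative, not taken from the course code):

#include <cuda_runtime.h>
#include <stdlib.h>

/* Illustrative only: the data stays resident on the device for the
 * whole time loop, with a single transfer in each direction. */
size_t bytes = N * sizeof(float);
float *hostData = (float *) malloc(bytes);
float *deviceData;
cudaMalloc((void **) &deviceData, bytes);

/* ... initialise hostData on the host ... */
cudaMemcpy(deviceData, hostData, bytes, cudaMemcpyHostToDevice);

for (int it = 0; it < nTimeSteps; it++) {
  myCheapKernel <<< blocksPerGrid, threadsPerBlock >>> (deviceData, N);
  myExpensiveKernel <<< blocksPerGrid, threadsPerBlock >>> (deviceData, N);
}

cudaMemcpy(hostData, deviceData, bytes, cudaMemcpyDeviceToHost);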
for (i = 0; i < 512; i++) {
  for (j = 0; j < 512; j++) {
    /* ... work ... */
  }
}
/* C: recall right-most index runs fastest */
for (i = 0; i < NI; i++) {
  for (j = 0; j < NJ; j++) {
    output[i][j] = input[i][j];
  }
}
! Fortran: recall left-most index runs fastest
do j = 1, NJ
  do i = 1, NI
    output(i,j) = input(i,j)
  end do
end do
For coalesced access, consecutive threads should access consecutive memory locations:
threadsPerBlock = (nThreads, 1, 1)
/* In C: */
idx = blockIdx.x*blockDim.x + threadIdx.x;
output[idx] = input[idx];
! In Fortran:
idx = (blockIdx%x - 1)*blockDim%x + threadIdx%x
output(idx) = input(idx)
/* Bad: consecutive threads have strided access */
i = blockIdx.x*blockDim.x + threadIdx.x;
for (j = 0; j < NJ; j++) {
  output[i][j] = input[i][j];
}
/* Good: consecutive threads have contiguous access */
j = blockIdx.x*blockDim.x + threadIdx.x;
for (i = 0; i < NI; i++) {
  output[i][j] = input[i][j];
}
! Bad: consecutive threads have strided access
j = (blockIdx%x - 1)*blockDim%x + threadIdx%x
do i = 1, NI
  output(i, j) = input(i, j)
end do
! Good: consecutive threads have contiguous access
i = (blockIdx%x - 1)*blockDim%x + threadIdx%x
do j = 1, NJ
  output(i, j) = input(i, j)
end do
blocksPerGrid = (nBlocksX, nBlocksY, 1)
threadsPerBlock = (nThreadsX, nThreadsY, 1)
/* C: note apparent transposition of i, j here... */
int j = blockIdx.x*blockDim.x + threadIdx.x;
int i = blockIdx.y*blockDim.y + threadIdx.y;
output[i][j] = input[i][j];
! Fortran: looks more natural
i = (blockIdx%x - 1)*blockDim%x + threadIdx%x
j = (blockIdx%y - 1)*blockDim%y + threadIdx%y
output(i, j) = input(i, j)
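Putting the 2D launch configuration and indexing together in C, a minimal sketch (the copy2d kernel name, the 16x16 block size, the #define'd dimensions, and the device pointers d_output/d_input are all illustrative; NI and NJ are assumed to be multiples of the block size):

#define NI 512
#define NJ 512

/* Sketch: copy an NI x NJ array using a 2D decomposition.
 * Grid/block x covers the fast (right-most) index j,
 * grid/block y covers the slow index i. */
__global__ void copy2d(float (*output)[NJ], const float (*input)[NJ]) {
  int j = blockIdx.x*blockDim.x + threadIdx.x;
  int i = blockIdx.y*blockDim.y + threadIdx.y;
  output[i][j] = input[i][j];
}

/* Launch: grid x spans NJ, grid y spans NI */
dim3 threadsPerBlock(16, 16, 1);
dim3 blocksPerGrid(NJ/16, NI/16, 1);
copy2d <<< blocksPerGrid, threadsPerBlock >>> (d_output, d_input);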
__global__ void kernel1d(int N, float* data) {
  /* Grid-stride loop: each thread strides through the array,
   * so any launch configuration covers all N elements */
  for (int i = blockIdx.x*blockDim.x + threadIdx.x;
       i < N;
       i += blockDim.x * gridDim.x) {
    // loop body
  }
}

int main() {
  int devId;
  cudaGetDevice(&devId);

  int numSM;
  cudaDeviceGetAttribute(&numSM,
                         cudaDevAttrMultiProcessorCount, devId);

  /* N and dev_data assumed set up elsewhere; launch enough
   * blocks to occupy all the SMs on this device */
  kernel1d<<<32*numSM, 128>>>(N, dev_data);
}
Can also run the kernel in serial for debugging.
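For example, because the grid-stride loop covers all N elements whatever the launch configuration, the same kernel can be launched with one block of one thread to process the elements serially (a minimal sketch reusing kernel1d, N and dev_data from above):

/* Serial execution of the same kernel, handy for debugging */
kernel1d<<<1, 1>>>(N, dev_data);
cudaDeviceSynchronize();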
/* Bad: threads in same warp diverge... */
tid = blockIdx.x*blockDim.x + threadIdx.x;
if (tid % 2 == 0) {
  /* Threads 0, 2, 4, ... do one thing ... */
}
else {
  /* Threads 1, 3, 5, ... do something else */
}
/* Good: threads in same warp follow same path ...
* Note use of the internal variable "warpSize" */
tid = blockIdx.x*blockDim.x + threadIdx.x;
if ( (tid / warpSize) % 2 == 0) {
  /* Threads 0, 1, 2, 3, 4, ... do one thing ... */
}
else {
  /* Threads 32, 33, 34, 35, ... do something else */
}
$ nvcc -Xptxas -v ...
#SBATCH --qos=gpu
# Further batch system directives
module load nvidia/nvhpc
nsys profile -o reconstruct-${SLURM_JOB_ID} ./reconstruct
Install the Nsight Systems GUI from the NVIDIA website, and open the .qdrep file.
#SBATCH --qos=gpu
# Further batch system directives
module load nvidia/nvhpc
ncu -o reconstruct-${SLURM_JOB_ID} reconstruct
ncu -o reconstruct-${SLURM_JOB_ID} \
--kernel-name 'inverseEdgeDetect' \
--launch-skip 1 \
--launch-count 10 \
--set detailed \
reconstruct
Install the Nsight Compute GUI from the NVIDIA website, and open the .ncu-rep files you copy from Cirrus.
#include <cuda_profiler_api.h>
cudaProfilerStart()   /* before region of interest */
cudaProfilerStop()    /* after region of interest */

#include <nvToolsExt.h>
nvtxMark("Custom message");
void ComputeNextTimeStep() {
  nvtxRangePush("Do timestep");
  compute_kernel <<< ... >>>(args);
  cudaDeviceSynchronize();
  nvtxRangePop();
}
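A minimal sketch of bracketing a region of interest with the profiler start/stop API (the kernel name, launch configuration and args are placeholders carried over from the example above):

#include <cuda_profiler_api.h>

/* Only the region between Start and Stop is captured when the
 * profiler is told to honour these calls, e.g. with
 * nsys profile --capture-range=cudaProfilerApi */
cudaProfilerStart();
compute_kernel <<< ... >>> (args);
cudaDeviceSynchronize();
cudaProfilerStop();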