Kevin Stratford
kevin@epcc.ed.ac.uk
Material by: Alan Gray, Kevin Stratford, Rupert Nash
  
  
/* The dim3 structure has three unsigned integer components: */
struct dim3 {
  unsigned int x;
  unsigned int y;
  unsigned int z;
};
     
     
! The corresponding derived type in CUDA Fortran:
type :: dim3
  integer :: x
  integer :: y
  integer :: z
end type dim3
     
     
/* Consider the one-dimensional loop: */
for (int i = 0; i < LOOP_LENGTH; i++) {
   result[i] = 2*i;
}
    
  
__global__ void myKernel(int * result) {
  int i;
  i = threadIdx.x;
  result[i] = 2*i;
}
    
  
/* Kernel is launched by the host by specifying
 * Number of blocks (sometimes "blocksPerGrid")
 * Number of threads per block */
dim3 blocks;
dim3 threadsPerBlock;
blocks.x = 1;
threadsPerBlock.x = LOOP_LENGTH;
myKernel <<< blocks, threadsPerBlock >>> (result);
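This single-block launch only works while LOOP_LENGTH does not exceed the hardware limit on threads per block (1024 on current NVIDIA GPUs). As a minimal sketch (not part of the original example), the limit can be queried at run time for device 0:

#include <stdio.h>
#include <cuda_runtime.h>

int main(void) {
  int maxThreads = 0;
  /* Query the per-block thread limit for device 0 */
  cudaError_t err = cudaDeviceGetAttribute(&maxThreads,
                                           cudaDevAttrMaxThreadsPerBlock, 0);
  if (err != cudaSuccess) {
    fprintf(stderr, "cudaDeviceGetAttribute: %s\n", cudaGetErrorString(err));
    return 1;
  }
  printf("Maximum threads per block: %d\n", maxThreads);
  return 0;
}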
    
    
! In Fortran an analogous kernel is...
 
attributes(global) subroutine myKernel(result)
  integer, dimension(:) :: result
  integer               :: i
  i = threadIdx%x
  result(i) = 2*i
end subroutine myKernel
! ... with execution ...
blocks%x = 1
threadsPerBlock%x = LOOP_LENGTH
call myKernel <<< blocks, threadsPerBlock >>> (result)
    
  
/* A single block executes on only one SM, so one block makes very poor
 * use of the GPU. For large arrays we usually want many blocks. */
__global__ void myKernel(int * result) {
  int i = blockIdx.x*blockDim.x + threadIdx.x;
  result[i] = 2*i;
}
/* ... with execution ... */
blocks.x = NBLOCKS;
threadsPerBlock.x = LOOP_LENGTH/NBLOCKS;
myKernel <<< blocks, threadsPerBlock >>> (result);
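The configuration above assumes LOOP_LENGTH is an exact multiple of NBLOCKS. If it is not, a common pattern (sketched here with the illustrative names myKernelGuarded and THREADS_PER_BLOCK) is to round the number of blocks up and guard against out-of-range indices in the kernel:

__global__ void myKernelGuarded(int * result, int length) {
  int i = blockIdx.x*blockDim.x + threadIdx.x;
  /* Threads past the end of the array do nothing */
  if (i < length) result[i] = 2*i;
}

/* ... with execution ... */
threadsPerBlock.x = THREADS_PER_BLOCK;
blocks.x = (LOOP_LENGTH + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK;
myKernelGuarded <<< blocks, threadsPerBlock >>> (result, LOOP_LENGTH);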
     
  
attributes(global) subroutine myKernel(result)
  integer, dimension(:) :: result
  integer               :: i
  i = (blockIdx%x - 1)*blockDim%x + threadIdx%x
  result(i) = 2*i
end subroutine myKernel
! ... with execution ...
blocks%x = NBLOCKS
threadsPerBlock%x = LOOP_LENGTH/NBLOCKS
call myKernel <<< blocks, threadsPerBlock >>> (result)
    
  
dim3 gridDim;    /* Number of blocks */
dim3 blockDim;   /* Number of threads per block */
    
    
dim3 blockIdx;   /* 0 <= blockIdx.x < gridDim.x etc */
    
    
dim3 threadIdx;  /* 0 <= threadIdx.x < blockDim.x etc */
    
  
type (dim3) :: gridDim   ! Number of blocks
type (dim3) :: blockDim  ! Number of threads per block
    
    
type (dim3) :: blockIdx  ! 1 <= blockIdx%x <= gridDim%x etc
    
    
type (dim3) :: threadIdx ! 1 <= threadIdx%x <= blockDim%x etc
    
  
__global__ void matrix2d(float a[N][N], float b[N][N],
                         float c[N][N]) {
  int j = blockIdx.x*blockDim.x + threadIdx.x;
  int i = blockIdx.y*blockDim.y + threadIdx.y;
  c[i][j] = a[i][j] + b[i][j];
}
    
    
/* ... with execution, e.g.,  ... */
dim3 blocksPerGrid(N/16, N/16, 1);
dim3 threadsPerBlock(16, 16, 1);
matrix2d <<< blocksPerGrid, threadsPerBlock >>> (a, b, c);
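This configuration assumes N is an exact multiple of 16. A sketch of the same kernel with a guard (matrix2dGuarded is an illustrative name), so that an arbitrary N can be handled by rounding the number of blocks up in both dimensions:

__global__ void matrix2dGuarded(float a[N][N], float b[N][N],
                                float c[N][N]) {
  int j = blockIdx.x*blockDim.x + threadIdx.x;
  int i = blockIdx.y*blockDim.y + threadIdx.y;
  /* Blocks at the edge of the grid may have threads outside the matrix */
  if (i < N && j < N) c[i][j] = a[i][j] + b[i][j];
}

/* ... with execution ... */
dim3 threadsPerBlock(16, 16, 1);
dim3 blocksPerGrid((N + 15)/16, (N + 15)/16, 1);
matrix2dGuarded <<< blocksPerGrid, threadsPerBlock >>> (a, b, c);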
    
  
/* Kernel launches are asynchronous: control returns to the host immediately */
myKernel <<< blocksPerGrid, threadsPerBlock >>> (...);
/* ... host could perform independent work here ... */
err = cudaDeviceSynchronize();
/* ... now safe to obtain results of kernel ... */
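Kernel launches do not return an error code directly; launch failures (for example, an invalid execution configuration) are reported through the error state. A sketch of a common checking pattern, assuming <stdio.h> is included:

myKernel <<< blocksPerGrid, threadsPerBlock >>> (result);

/* Check for errors at launch (e.g., invalid configuration) ... */
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
  fprintf(stderr, "Launch failed: %s\n", cudaGetErrorString(err));
}

/* ... and for errors that occur while the kernel runs */
err = cudaDeviceSynchronize();
if (err != cudaSuccess) {
  fprintf(stderr, "Kernel failed: %s\n", cudaGetErrorString(err));
}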
    
/* Asynchronous variants also exist, e.g., cudaMemcpyAsync(), ... */
      
/* For example, provide an allocation of "nSize" floats
 * in the device memory: */
float * data;
err = cudaMalloc(&data, nSize*sizeof(float));
...
err = cudaFree(data);
    
/* Transfers between host and device use cudaMemcpy().
 * Copy host data values to device memory ... */
err = cudaMemcpy(dataDevice, dataHost, nSize*sizeof(float),
                 cudaMemcpyHostToDevice);
/* And back again ... */
err = cudaMemcpy(dataHost, dataDevice, nSize*sizeof(float),
                 cudaMemcpyDeviceToHost);
    
    
cudaError_t cudaMemcpy(void * dest, const void * src,
                       size_t count,
                       cudaMemcpyKind kind);
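Putting the pieces together, a minimal end-to-end host program for the one-dimensional kernel might look like the following sketch (LOOP_LENGTH and THREADS_PER_BLOCK are illustrative values; error checks are omitted for brevity):

#include <stdio.h>
#include <cuda_runtime.h>

#define LOOP_LENGTH 256
#define THREADS_PER_BLOCK 64

__global__ void myKernel(int * result) {
  int i = blockIdx.x*blockDim.x + threadIdx.x;
  result[i] = 2*i;
}

int main(void) {
  int hostResult[LOOP_LENGTH];
  int * devResult = NULL;

  /* Allocate device memory and launch the kernel */
  cudaMalloc(&devResult, LOOP_LENGTH*sizeof(int));
  myKernel <<< LOOP_LENGTH/THREADS_PER_BLOCK, THREADS_PER_BLOCK >>> (devResult);

  /* Copy the result back; cudaMemcpy waits for the kernel to finish */
  cudaMemcpy(hostResult, devResult, LOOP_LENGTH*sizeof(int),
             cudaMemcpyDeviceToHost);

  printf("hostResult[10] = %d\n", hostResult[10]);

  cudaFree(devResult);
  return 0;
}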
     
! In CUDA Fortran, device memory is declared with the "device" attribute.
        
! Make an allocation in device memory:
real, device, allocatable :: dataDevice(:)
allocate(dataDevice(nSize), stat = ...)
...
deallocate(dataDevice)
    
! (allocate/deallocate here take the place of cudaMalloc() and cudaFree())
      
! Copy from host to device: assignment generates the transfer
dataDevice(:) = dataHost(:)
! ... and back again ...
dataHost(:) = dataDevice(:)
    
    
! Or explicitly (note the count is in array elements):
err = cudaMemcpy(dataDevice, dataHost, nSize, &
                 cudaMemcpyHostToDevice)
    
CUDA C source files have the extension .cu by convention, and are compiled with nvcc:
$ nvcc -o example example.cu
    
CUDA Fortran source files have the extension .cuf by convention, and are compiled with nvfortran using the -cuda option:
      
$ nvfortran -cuda -o example example.cuf