/* Consider a loop (scale with "a" constant): */
for (int index = 0; index < nElements; index++) {
x[index] = a*x[index];
}
/* Pattern... */
for (...)
/* Execution Policy... */
(int index = 0; index < nElements; index++)
/* Body... */
x[index] = a*x[index];
A combination of pattern and policy drives execution of the body
#pragma omp parallel for
for (int index = 0; index < nElements; index++) {
x[index] = a*x[index];
}
/* Pattern ... */
#pragma omp ... for
for (...)
/* Policy ... */
#pragma omp parallel ...
... (int index = 0; index < nElements; index++)
/* Body (as before) ... */
Distribute iterations of the body between asynchronous threads
#include "Kokkos_Core.hpp"
/* ... */
Kokkos::parallel_for(policy, body);
/* ...being pattern, policy, computational body */
The body is specified as a function object.
/* Use a function object... */
struct Scal {
double a_;
double * x_;
Scal(double a, double * x) : a_ (a), x_(x) {};
void operator() (int index) const {
x_[index] = a_*x_[index];
}
};
/* ... with a policy which states that the range is
* of the appropriate length "nElements" */
double a = 2.0;
double * x = new double[nElements];
Scal scal(a, x);
Kokkos::parallel_for(nElements, scal);
/* As before, we have in scope ... */
double a = 2.0;
double * x = new double[nElements];
/* ... */
Kokkos::parallel_for(nElements, [=] (int index) {
x[index] = a*x[index];
});
/* Consider a vector product = < a b > */
double * a = new double[nElements];
double * b = new double[nElements];
/* ... */
double result = 0.0;
for (int index = 0; index < nElements; index++) {
result += a[index]*b[index];
}
/* ... we have a reduction ... */
#omp pragma parallel for reduction (+: result)
for (int index = 0; index < nElements; index++) {
result += a[index]*b[index];
}
/* Pattern ... */
#pragma ... for reduction(+: result)
for (...)
/* Policy... */
#pragma omp parallel ...
... (int index = 0; index < nElements; index++)
/* ... use Kokkos::parallel_reduce() pattern ... */
double result = 0.0;
Kokkos::parallel_reduce(nElements, [=] (int index, double & sum) {
/* Form a contribution to the sum... */
sum += a[index]*b[index];
}, result);
/* nb.
* 1: variable sum is "thread-private" and managed by Kokkos.
*
* 2: nowhere have we said this is a sum: it is a default
* (one of many in Kokkos)
*/
/* Kokkos provides a lightweight class which represents
* one-dimensional, two-dimensional etc arrays: e.g.,: */
Kokkos::View < double * > x("my vector", nElements);
Kokkos::parallel_for(nElements, [=] (int index) {
x(index) = a*x(index);
});
Data associated with the View
is in the default
MemorySpace
Default controlled as part of compilation, e.g.:
> make KOKKOS_DEVICES=OpenMP
May be controlled explicitly via template argument
/* E.g., */
Kokkos::View < double *, CudaSpace > x("my vector", nElements)
/* A convenience for default memory space... */
typedef Kokkos::View < double * > ViewVectorType;
/* Declare vector in default space */
ViewVectorType x("my vector", nElements);
/* Declare a "host mirror view" */
ViewVectorType::HostMirror h_x = Kokkos::create_mirror_view(x);
/* Initialise the host view */
Kokkos::parallel_for(nElements, [=] (int index) {
h_x(index) = 1.0;
});
/* Explicit copy */
Kokkos::deep_copy(x, h_x);
A memory space has an associated ExecutionSpace
/* May set explicitly via the policy */
Kokkos::RangePolicy < HostSpace > policy(0, nElements);
Kokkos::parallel_for(policy, [=] (int index) {
h_x(index) = a*h_x(index);
});
/* Consider an inner product < y^T Ax > involving
* matrix A (M rows, N columns) */
double * A = new double[M*N];
double * x = new double[N];
double * y = new double[M];
...
result = 0.0;
for (int i = 0; i < M; i++) {
sum = 0.0;
for (int j = 0; j < N; j++) {
sum += A[i*N + j]*x[j];
}
result += y[i]*sum;
}
typedef Kokkos::LayoutRight Layout;
/* All views have a layout... */
Kokkos::View < double **, Layout > A("Matrix A", M, N);
Kokkos::View < double *, Layout > x("Vector x", N);
Kokkos::View < double *, Layout > y("Vector y", M);
/* ... */
Kokkos::parallel_reduce(M, [=] (int i, double & rSum) {
double sum = 0.0;
for (int j = 0; j < N; j++) {
sum += A(i,j)*x(j);
}
rSum += y(i)*sum;
}, result);
But is the layout, well, right?
Each ExecutionSpace
has associated default layout
Kokkos::LayoutRight
Kokkos::LayoutLeft
Appropriate layout crucial to performance.
/* Parallelism is over rows M ... */
Kokkos::parallel_reduce(M, [=] (int i, double & rSum) {
double sum = 0.0;
for (int j = 0; j < N; j++) {
sum += A(i,j)*x(j);
}
rSum += y(i)*sum;
}, result);
/* A shorthand: */
typedef Kokkos::TeamPolicy::member_type member_type;
/* Create a team policy in default execution space... */
Kokkos::TeamPolicy <> teamPolicy(numberOfTeams, teamSize);
Kokkos::parallel_reduce(teamPolicy,
[=] (const member_type & teamMember, double & rSum) {
/* All threads in team are active and share memory */
int myTeam = teamMember.league_rank();
int myRank = teamMember.team_rank();
/* Need parallelism (a reduction) over columns ...
* ... and compute contribution rSum ... */
}, result);
/* < y^T Ax > */
Kokkos::TeamPolicy <> teamPolicy(M, Kokkos::AUTO);
Kokkos::parallel_reduce(teamPolicy,
[=] (const member_type & teamMember, double & rSum) {
double sum = 0.0;
int i = teamMember.league_rank(); // ie., row
Kokkos::TeamThreadRange teamThreadPolicy(teamMember, N); // columns
Kokkos::parallel_reduce(teamThreadPolicy, [=] (int j, double & cSum) {
cSum += A(i,j)*x(j);
}, sum);
/* Care .... */
if (teamMember.team_rank() == 0) rSum += y(i)*sum;
}, result);
/* Schematically... */
/* Outer level is TeamPolicy */
TeamPolicy <> teamPolicy(numberOfTeams, threadsPerTeam, vectorLength);
Kokkos::parallel_for(teamPolicy, [=] (member_type & teamMember [,...]) {
/* Team level code ...*/
TeamThreadRange teamThreadRange(teamMember, rangeSize);
Kokkos::parallel_for(teamThreadRange, [=] (int index [,...]) {
/* Thread level code ... */
ThreadVectorRange threadVectorRange(teamMember, vectorRange);
Kokkos::parallel_for(threadVectorRange, [=] (int indexv [,...]) {
/* Vector level code ...*/
});
});
});