Examples¶
First example¶
To better understand this model, we make an example with block-based matrix multiplication. Given two matrices, the program will multiply them and produce a new matrix.
Suppose there are two input matrices A and B of size N×N and an output matrix C. The multiplication of matrices A and B can be performed using the following code:
void MatMul(int &A, int &B, int &C) {
for (int i = 0; i < N; ++i){
for (int j = 0; j < N ; ++j) {
C[i][j] = 0;
for (int k = 0; k < N ; ++k) {
C[i][j] += A[i][k] * B[k][j];
}
}
}
}
It is possible to use a block partitioned matrix product that involves only algebra on submatrices of the factors. The partitioning of the factors is not arbitrary, however, and requires conformable partitions between two matrices A and B such that all submatrix products that will be used are defined.
The following figure shows the block-based matrix multiplication.
In this case, the code would be:
// Block-partitioned matrix multiplication: C += A * B, computed one
// BS x BS block at a time to exploit data locality.
void BlockMatMul(BlockMatrix &A, BlockMatrix &B, BlockMatrix &C) {
const int NumBlocks = N / BS;
// Walk the grid of blocks; each (bi, bj) selects one output block of C,
// which accumulates the products over the bk dimension.
for (int bi = 0; bi < NumBlocks; ++bi) {
for (int bj = 0; bj < NumBlocks; ++bj) {
float *BlockC = C.GetBlock(bi, bj);
for (int bk = 0; bk < NumBlocks; ++bk) {
float *BlockA = A.GetBlock(bi, bk);
float *BlockB = B.GetBlock(bk, bj);
// Dense BS x BS multiply-accumulate of the two blocks into BlockC.
for (int i = 0; i < BS; ++i) {
for (int j = 0; j < BS; ++j) {
for (int k = 0; k < BS; ++k) {
BlockC[i + j * BS] += BlockA[i + k * BS] * BlockB[k + j * BS];
}
}
}
}
}
}
}
In our example, the BlockMatrix class is only used as a utility wrapper to split the whole matrix into blocks so as to benefit from data locality (each block is contained by a different array).
We can parallelize the code by performing the multiplication of each pair of blocks in a different node using the following code:
// Parallel block-partitioned multiplication: C += A * B, with one
// asynchronous target task per pair of blocks.
void BlockMatMul(BlockMatrix &A, BlockMatrix &B, BlockMatrix &C) {
// One thread of the team creates all target tasks; `nowait` makes them
// deferred, and the depend clauses order tasks that touch the same C block.
#pragma omp parallel
#pragma omp single
for (int i = 0; i < N / BS; ++i)
for (int j = 0; j < N / BS; ++j) {
float *BlockC = C.GetBlock(i, j);
for (int k = 0; k < N / BS; ++k) {
float *BlockA = A.GetBlock(i, k);
float *BlockB = B.GetBlock(k,j);
// Offload this block multiplication as a deferred target task:
// - depend(...) uses the first element of each block as the dependency
//   handle, serializing tasks that update the same C block;
// - map(to:) ships the two BS*BS input blocks, map(tofrom:) ships the
//   C block in and copies the updated result back.
#pragma omp target depend(in: *BlockA, *BlockB) \
depend(inout: *BlockC) \
map(to: BlockA[:BS*BS], BlockB[:BS*BS]) \
map(tofrom: BlockC[:BS*BS]) nowait
for(int ii = 0; ii < BS; ii++)
for(int jj = 0; jj < BS; jj++) {
for(int kk = 0; kk < BS; ++kk)
BlockC[ii + jj * BS] += BlockA[ii + kk * BS] * BlockB[kk + jj * BS];
}
}
}
}
As we carry out the multiplication of each block in the node, we have to
send the block of matrix A, the block of matrix B as input
(map(to: BlockA[:BS*BS], BlockB[:BS*BS])
), and the block of matrix C
as output and input (map(tofrom: BlockC[:BS*BS])
). The
multiplication process depends on input blocks A and B
(depend(in: *BlockA, *BlockB)
) and on block C as both input and output
(depend(inout: *BlockC)
).
It is also possible to optimize the code even more by using a second level of parallelism within each
node using the traditional parallel for
directive as shown below:
// Same task-per-block scheme as before, but with a second level of
// parallelism: each target task runs its block multiplication with a
// `parallel for` inside the target region.
void BlockMatMul(BlockMatrix &A, BlockMatrix &B, BlockMatrix &C) {
// A single thread creates all deferred (nowait) target tasks.
#pragma omp parallel
#pragma omp single
for (int i = 0; i < N / BS; ++i)
for (int j = 0; j < N / BS; ++j) {
float *BlockC = C.GetBlock(i, j);
for (int k = 0; k < N / BS; ++k) {
float *BlockA = A.GetBlock(i, k);
float *BlockB = B.GetBlock(k,j);
// Dependencies on the first element of each block serialize tasks
// updating the same C block; the map clauses move the three BS*BS
// blocks to and from the device for every task.
#pragma omp target depend(in: *BlockA, *BlockB) \
depend(inout: *BlockC) \
map(to: BlockA[:BS*BS], BlockB[:BS*BS]) \
map(tofrom: BlockC[:BS*BS]) nowait
// Second level of parallelism: split the ii loop across the threads
// available inside the target device/node.
#pragma omp parallel for
for(int ii = 0; ii < BS; ii++)
for(int jj = 0; jj < BS; jj++) {
for(int kk = 0; kk < BS; ++kk)
BlockC[ii + jj * BS] += BlockA[ii + kk * BS] * BlockB[kk + jj * BS];
}
}
}
}
The problem with this implementation is that the 3 blocks must be transmitted between the head and worker processes for each target task without any possibility for the runtime to improve the communication between the nodes.
To fix it, the input blocks can be sent previously using target data tasks
(target enter data map(...) depend(...) nowait
) and the resulting blocks
retrieved back using opposite target data tasks
(target exit data map(from: ...) depend(...) nowait
) at the end. In this case,
the OMPC scheduler and data manager will be able to optimize the mapping of the target tasks
on the worker processes and the communication between them. Here would be the resulting code:
// Final version: decouple data movement from computation. Blocks are
// mapped once with `target enter data` tasks, the multiplications run
// with implicit mapping, and results are retrieved with `target exit
// data` tasks, letting the runtime optimize inter-node communication.
void BlockMatMul(BlockMatrix &A, BlockMatrix &B, BlockMatrix &C) {
#pragma omp parallel
#pragma omp single
{
// Maps all matrices' blocks asynchronously (as tasks). Each enter-data
// task publishes its block through depend(out:) so later compute tasks
// wait only for the blocks they actually read or write.
for (int i = 0; i < N / BS; ++i) {
for (int j = 0; j < N / BS; ++j) {
float *BlockA = A.GetBlock(i, j);
#pragma omp target enter data map(to: BlockA[:BS*BS]) \
depend(out: *BlockA) nowait
float *BlockB = B.GetBlock(i, j);
#pragma omp target enter data map(to: BlockB[:BS*BS]) \
depend(out: *BlockB) nowait
float *BlockC = C.GetBlock(i, j);
#pragma omp target enter data map(to: BlockC[:BS*BS]) \
depend(out: *BlockC) nowait
}
}
for (int i = 0; i < N / BS; ++i)
for (int j = 0; j < N / BS; ++j) {
float *BlockC = C.GetBlock(i, j);
for (int k = 0; k < N / BS; ++k) {
float *BlockA = A.GetBlock(i, k);
float *BlockB = B.GetBlock(k,j);
// Submits the multiplication for the ijk-block.
// Data is mapped implicitly and automatically moved by the runtime;
// no nowait here, but the depend clauses still order the task after
// the enter-data tasks for the three blocks it uses.
#pragma omp target depend(in: *BlockA, *BlockB) \
depend(inout: *BlockC)
#pragma omp parallel for
for(int ii = 0; ii < BS; ii++)
for(int jj = 0; jj < BS; jj++) {
for(int kk = 0; kk < BS; ++kk)
BlockC[ii + jj * BS] += BlockA[ii + kk * BS] * BlockB[kk + jj * BS];
}
}
}
// Removes all matrices' blocks and acquires the final result
// asynchronously: A and B are only released (no copy back), while the
// C blocks are copied back to the host with map(from:).
for (int i = 0; i < N / BS; ++i) {
for (int j = 0; j < N / BS; ++j) {
float *BlockA = A.GetBlock(i, j);
#pragma omp target exit data map(release: BlockA[:BS*BS]) \
depend(inout: *BlockA) nowait
float *BlockB = B.GetBlock(i, j);
#pragma omp target exit data map(release: BlockB[:BS*BS]) \
depend(inout: *BlockB) nowait
float *BlockC = C.GetBlock(i, j);
#pragma omp target exit data map(from: BlockC[:BS*BS]) \
depend(inout: *BlockC) nowait
}
}
}
}
It is important to point out that every target task must use the first position of the block as a dependency. As said earlier, this is compulsory for the runtime to be able to keep track of the data used and to manage the communication between worker processes correctly.
More examples¶
More examples are available here: OMPC examples.
Currently, the following examples are available:
hello-world
fibonacci
reduce-sum
matmul
cholesky