Skip to content

Commit d24e102

Browse files
shiltiantstellar
authored andcommitted
[OpenMP] Fixed a crash when offloading to x86_64 with target nowait
PR#49334 reports a crash when offloading to x86_64 with `target nowait`, which is caused by referencing a nullptr. The root cause of the issue is, when pushing a hidden helper task in `__kmp_push_task`, it also maps the gtid to its shadow gtid, which is wrong. Reviewed By: jdoerfert Differential Revision: https://reviews.llvm.org/D97329 (cherry picked from commit e5da63d)
1 parent 52510d8 commit d24e102

File tree

2 files changed

+150
-1
lines changed

2 files changed

+150
-1
lines changed
Lines changed: 148 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,148 @@
1+
// RUN: %libomptarget-compilexx-run-and-check-aarch64-unknown-linux-gnu
2+
// RUN: %libomptarget-compilexx-run-and-check-powerpc64-ibm-linux-gnu
3+
// RUN: %libomptarget-compilexx-run-and-check-powerpc64le-ibm-linux-gnu
4+
// RUN: %libomptarget-compilexx-run-and-check-x86_64-pc-linux-gnu
5+
// RUN: %libomptarget-compilexx-run-and-check-nvptx64-nvidia-cuda
6+
7+
#include <cassert>
8+
#include <iostream>
9+
#include <memory>
10+
#include <vector>
11+
12+
class BlockMatrix {
13+
private:
14+
const int rowsPerBlock;
15+
const int colsPerBlock;
16+
const long nRows;
17+
const long nCols;
18+
const int nBlocksPerRow;
19+
const int nBlocksPerCol;
20+
std::vector<std::vector<std::unique_ptr<float[]>>> Blocks;
21+
22+
public:
23+
BlockMatrix(const int _rowsPerBlock, const int _colsPerBlock,
24+
const long _nRows, const long _nCols)
25+
: rowsPerBlock(_rowsPerBlock), colsPerBlock(_colsPerBlock), nRows(_nRows),
26+
nCols(_nCols), nBlocksPerRow(_nRows / _rowsPerBlock),
27+
nBlocksPerCol(_nCols / _colsPerBlock), Blocks(nBlocksPerCol) {
28+
for (int i = 0; i < nBlocksPerCol; i++) {
29+
for (int j = 0; j < nBlocksPerRow; j++) {
30+
Blocks[i].emplace_back(new float[_rowsPerBlock * _colsPerBlock]);
31+
}
32+
}
33+
};
34+
35+
// Initialize the BlockMatrix from 2D arrays
36+
void Initialize(const std::vector<float> &matrix) {
37+
for (int i = 0; i < nBlocksPerCol; i++)
38+
for (int j = 0; j < nBlocksPerRow; j++) {
39+
float *CurrBlock = GetBlock(i, j);
40+
for (int ii = 0; ii < colsPerBlock; ++ii)
41+
for (int jj = 0; jj < rowsPerBlock; ++jj) {
42+
int curri = i * colsPerBlock + ii;
43+
int currj = j * rowsPerBlock + jj;
44+
CurrBlock[ii + jj * colsPerBlock] = matrix[curri + currj * nCols];
45+
}
46+
}
47+
}
48+
49+
long Compare(const std::vector<float> &matrix) const {
50+
long fail = 0;
51+
for (int i = 0; i < nBlocksPerCol; i++)
52+
for (int j = 0; j < nBlocksPerRow; j++) {
53+
float *CurrBlock = GetBlock(i, j);
54+
for (int ii = 0; ii < colsPerBlock; ++ii)
55+
for (int jj = 0; jj < rowsPerBlock; ++jj) {
56+
int curri = i * colsPerBlock + ii;
57+
int currj = j * rowsPerBlock + jj;
58+
float m_value = matrix[curri + currj * nCols];
59+
float bm_value = CurrBlock[ii + jj * colsPerBlock];
60+
if (bm_value != m_value) {
61+
fail++;
62+
}
63+
}
64+
}
65+
return fail;
66+
}
67+
68+
float *GetBlock(int i, int j) const {
69+
assert(i < nBlocksPerCol && j < nBlocksPerRow && "Accessing outside block");
70+
return Blocks[i][j].get();
71+
}
72+
};
73+
74+
constexpr const int BS = 256;
75+
constexpr const int N = 1024;
76+
77+
int BlockMatMul_TargetNowait(BlockMatrix &A, BlockMatrix &B, BlockMatrix &C) {
78+
#pragma omp parallel
79+
#pragma omp master
80+
for (int i = 0; i < N / BS; ++i)
81+
for (int j = 0; j < N / BS; ++j) {
82+
float *BlockC = C.GetBlock(i, j);
83+
for (int k = 0; k < N / BS; ++k) {
84+
float *BlockA = A.GetBlock(i, k);
85+
float *BlockB = B.GetBlock(k, j);
86+
// clang-format off
87+
#pragma omp target depend(in: BlockA[0], BlockB[0]) depend(inout: BlockC[0]) \
88+
map(to: BlockA[:BS * BS], BlockB[:BS * BS]) \
89+
map(tofrom: BlockC[:BS * BS]) nowait
90+
// clang-format on
91+
#pragma omp parallel for
92+
for (int ii = 0; ii < BS; ii++)
93+
for (int jj = 0; jj < BS; jj++) {
94+
for (int kk = 0; kk < BS; ++kk)
95+
BlockC[ii + jj * BS] +=
96+
BlockA[ii + kk * BS] * BlockB[kk + jj * BS];
97+
}
98+
}
99+
}
100+
return 0;
101+
}
102+
103+
void Matmul(const std::vector<float> &a, const std::vector<float> &b,
104+
std::vector<float> &c) {
105+
for (int i = 0; i < N; ++i) {
106+
for (int j = 0; j < N; ++j) {
107+
float sum = 0.0;
108+
for (int k = 0; k < N; ++k) {
109+
sum = sum + a[i * N + k] * b[k * N + j];
110+
}
111+
c[i * N + j] = sum;
112+
}
113+
}
114+
}
115+
116+
int main(int argc, char *argv[]) {
117+
std::vector<float> a(N * N);
118+
std::vector<float> b(N * N);
119+
std::vector<float> c(N * N, 0.0);
120+
121+
for (int i = 0; i < N; ++i) {
122+
for (int j = 0; j < N; ++j) {
123+
a[i * N + j] = b[i * N + j] = i + j % 100;
124+
}
125+
}
126+
127+
auto BlockedA = BlockMatrix(BS, BS, N, N);
128+
BlockedA.Initialize(a);
129+
BlockedA.Compare(a);
130+
auto BlockedB = BlockMatrix(BS, BS, N, N);
131+
BlockedB.Initialize(b);
132+
BlockedB.Compare(b);
133+
134+
Matmul(a, b, c);
135+
136+
auto BlockedC = BlockMatrix(BS, BS, N, N);
137+
BlockMatMul_TargetNowait(BlockedA, BlockedB, BlockedC);
138+
139+
if (BlockedC.Compare(c) > 0) {
140+
return 1;
141+
}
142+
143+
std::cout << "PASS\n";
144+
145+
return 0;
146+
}
147+
148+
// CHECK: PASS

openmp/runtime/src/kmp_tasking.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -326,7 +326,8 @@ static kmp_int32 __kmp_push_task(kmp_int32 gtid, kmp_task_t *task) {
326326
kmp_info_t *thread = __kmp_threads[gtid];
327327
kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
328328

329-
if (taskdata->td_flags.hidden_helper) {
329+
// We don't need to map to shadow gtid if it is already hidden helper thread
330+
if (taskdata->td_flags.hidden_helper && !KMP_HIDDEN_HELPER_THREAD(gtid)) {
330331
gtid = KMP_GTID_TO_SHADOW_GTID(gtid);
331332
thread = __kmp_threads[gtid];
332333
}

0 commit comments

Comments
 (0)