Skip to content

Commit 8ad3b9d

Browse files
authored
Merge pull request intel#1504 from bb-sycl/xmain
Auto pulldown and update tc files for xmain branch on 20230103
2 parents 13b86ce + 6d60d16 commit 8ad3b9d

File tree

221 files changed

+7183
-418
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

221 files changed

+7183
-418
lines changed

External/CUDA/CMakeLists.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,9 @@ set(SUPPORTED_GPU_CUDA_11_3 ${SUPPORTED_GPU_CUDA_11_2})
3737
set(SUPPORTED_GPU_CUDA_11_4 ${SUPPORTED_GPU_CUDA_11_3})
3838
set(SUPPORTED_GPU_CUDA_11_5 ${SUPPORTED_GPU_CUDA_11_4})
3939
set(SUPPORTED_GPU_CUDA_11_6 ${SUPPORTED_GPU_CUDA_11_5})
40+
set(SUPPORTED_GPU_CUDA_11_7 ${SUPPORTED_GPU_CUDA_11_6})
41+
set(SUPPORTED_GPU_CUDA_11_8 ${SUPPORTED_GPU_CUDA_11_7}
42+
sm_89 sm_90)
4043

4144
set(CUDA_NEW_DRIVER ON CACHE BOOL "Use the new Clang offloading Driver")
4245

External/SPEC/CINT2017rate/525.x264_r/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -181,7 +181,7 @@ foreach (_file IN LISTS ldecod_SourceNames)
181181
endforeach ()
182182
llvm_add_host_executable(ldecod_${SUFFIX}-host
183183
ldecod_${SUFFIX} ${ldecod_Sources}
184-
CPPFLAGS -I "${SRC_DIR}/ldecod_src/inc" -DSPEC
184+
CPPFLAGS -I "${SRC_DIR}/ldecod_src/inc" -DSPEC -fcommon
185185
LDFLAGS -lm
186186
)
187187

Lines changed: 142 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
// This program tests vectorized truncates & zero-extends for performance and
2+
// correctness
13
#include <iostream>
24
#include <memory>
35
#include <random>
@@ -11,70 +13,178 @@ static std::mt19937 rng;
1113
// Initialize array A with random numbers.
1214
template <typename Ty>
1315
static void init_data(const std::unique_ptr<Ty[]> &A, unsigned N) {
14-
std::uniform_int_distribution<uint64_t> distrib(
15-
std::numeric_limits<Ty>::min(), std::numeric_limits<Ty>::max());
16-
for (unsigned i = 0; i < N; i++)
17-
A[i] = static_cast<Ty>(distrib(rng));
16+
std::uniform_int_distribution<Ty> distrib(std::numeric_limits<Ty>::min(),
17+
std::numeric_limits<Ty>::max());
18+
for (unsigned I = 0; I < N; I++)
19+
A[I] = distrib(rng);
20+
}
21+
22+
// Truncate/Zero-extend elements to create expected results with no
23+
// vectorization
24+
template <typename Ty1, typename Ty2>
25+
static void truncOrZextWithNoVec(const Ty1 *A, Ty2 *B, int Iterations) {
26+
#pragma clang loop vectorize(disable)
27+
for (unsigned I = 0; I < Iterations; I++) {
28+
B[I] = A[I];
29+
}
1830
}
1931

2032
// Truncate/Zero-extend each vector element in a vectorized loop with vectorization width 8
21-
template <typename Ty1, typename Ty2> static void truncOrZextVecInLoopWithVW8(const Ty1 *A, Ty2 *B, int iterations) {
33+
template <typename Ty1, typename Ty2>
34+
static void truncOrZextVecInLoopWithVW8(const Ty1 *A, Ty2 *B, int Iterations) {
2235
#pragma clang loop vectorize_width(8) interleave_count(4)
23-
for (unsigned i = 0; i < iterations; i++) {
24-
B[i] = A[i];
36+
for (unsigned I = 0; I < Iterations; I++) {
37+
B[I] = A[I];
38+
}
39+
}
40+
41+
// Truncate/Zero-extend each vector element in a vectorized loop with
42+
// vectorization width 16
43+
template <typename Ty1, typename Ty2>
44+
static void truncOrZextVecInLoopWithVW16(const Ty1 *A, Ty2 *B, int Iterations) {
45+
#pragma clang loop vectorize_width(16) interleave_count(4)
46+
for (unsigned I = 0; I < Iterations; I++) {
47+
B[I] = A[I];
48+
}
49+
}
50+
51+
// Truncate/Zero-extend each vector element in a vectorized loop
52+
template <typename Ty1, typename Ty2>
53+
static void truncOrZextVecInLoop(const Ty1 *A, Ty2 *B, int Iterations) {
54+
#pragma clang loop vectorize(enable)
55+
for (unsigned I = 0; I < Iterations; I++) {
56+
B[I] = A[I];
57+
}
58+
}
59+
60+
// Truncate/Zero-extend each vector element while adding in a vectorized loop
61+
// with vectorization width 8
62+
template <typename Ty1, typename Ty2>
63+
static void truncOrZextVecWithAddInLoopWithVW8(const Ty1 *A, Ty2 *B,
64+
int Iterations) {
65+
#pragma clang loop vectorize_width(8) interleave_count(4)
66+
for (unsigned I = 0; I < Iterations; I++) {
67+
B[I] += A[I];
68+
}
69+
}
70+
71+
// Truncate/Zero-extend each vector element while adding in a vectorized loop
72+
// with vectorization width 16
73+
template <typename Ty1, typename Ty2>
74+
static void truncOrZextVecWithAddInLoopWithVW16(const Ty1 *A, Ty2 *B,
75+
int Iterations) {
76+
#pragma clang loop vectorize_width(16) interleave_count(4)
77+
for (unsigned I = 0; I < Iterations; I++) {
78+
B[I] += A[I];
79+
}
80+
}
81+
82+
// Truncate/Zero-extend each vector element while adding in a vectorized loop
83+
template <typename Ty1, typename Ty2>
84+
static void truncOrZextVecWithAddInLoop(const Ty1 *A, Ty2 *B, int Iterations) {
85+
#pragma clang loop vectorize(enable)
86+
for (unsigned I = 0; I < Iterations; I++) {
87+
B[I] += A[I];
2588
}
2689
}
2790

28-
template <typename Ty1, typename Ty2> static void __attribute__((always_inline))
29-
benchForTruncOrZextVecInLoopWithVW8(benchmark::State &state) {
91+
template <typename Ty1, typename Ty2>
92+
static void __attribute__((always_inline))
93+
benchForTruncOrZextVecInLoop(benchmark::State &state,
94+
void (*Fn)(const Ty1 *, Ty2 *, int)) {
3095
std::unique_ptr<Ty1[]> A(new Ty1[ITERATIONS]);
3196
std::unique_ptr<Ty2[]> B(new Ty2[ITERATIONS]);
97+
std::unique_ptr<Ty2[]> C(new Ty2[ITERATIONS]);
98+
3299
init_data(A, ITERATIONS);
33-
init_data(B, ITERATIONS);
100+
101+
// Check for correctness
102+
truncOrZextWithNoVec(&A[0], &C[0], ITERATIONS);
103+
Fn(&A[0], &B[0], ITERATIONS);
104+
for (int I = 0; I < ITERATIONS; I++) {
105+
if (B[I] != C[I]) {
106+
std::cerr << "ERROR: Trunc or ZExt operation on " << A[I]
107+
<< " is showing result " << B[I] << " instead of " << C[I]
108+
<< "\n";
109+
exit(1);
110+
}
111+
}
112+
34113
for (auto _ : state) {
35114
benchmark::DoNotOptimize(B);
36115
benchmark::ClobberMemory();
37-
truncOrZextVecInLoopWithVW8(&A[0], &B[0], ITERATIONS);
38-
}
39-
}
40-
41-
// Truncate/Zero-extend each vector element in a vectorized loop
42-
template <typename Ty1, typename Ty2> static void truncOrZextVecInLoop(const Ty1 *A, Ty2 *B, int iterations) {
43-
#pragma clang loop interleave_count(4)
44-
for (unsigned i = 0; i < iterations; i++) {
45-
B[i] = A[i];
116+
Fn(&A[0], &B[0], ITERATIONS);
46117
}
47118
}
48119

49-
template <typename Ty1, typename Ty2> static void __attribute__((always_inline))
50-
benchForTruncOrZextVecInLoop(benchmark::State &state) {
120+
template <typename Ty1, typename Ty2>
121+
static void __attribute__((always_inline))
122+
benchForTruncOrZextVecWithAddInLoop(benchmark::State &state,
123+
void (*Fn)(const Ty1 *, Ty2 *, int)) {
51124
std::unique_ptr<Ty1[]> A(new Ty1[ITERATIONS]);
52125
std::unique_ptr<Ty2[]> B(new Ty2[ITERATIONS]);
126+
std::unique_ptr<Ty2[]> C(new Ty2[ITERATIONS]);
53127
init_data(A, ITERATIONS);
54128
init_data(B, ITERATIONS);
55129
for (auto _ : state) {
56130
benchmark::DoNotOptimize(B);
57131
benchmark::ClobberMemory();
58-
truncOrZextVecInLoop(&A[0], &B[0], ITERATIONS);
132+
Fn(&A[0], &B[0], ITERATIONS);
59133
}
60134
}
61135

62136
// Add vectorized truncate or zero-extend operation benchmarks for different element types
63-
#define ADD_BENCHMARK(ty1, ty2) \
64-
void benchForTruncOrZextVecInLoopWithVW8From_##ty1##_To_##ty2##_(benchmark::State &state) { \
65-
benchForTruncOrZextVecInLoopWithVW8<ty1, ty2>(state); \
137+
#define ADD_BENCHMARK(ty1, ty2) \
138+
void benchForTruncOrZextVecInLoopWithVW8From_##ty1##_To_##ty2##_( \
139+
benchmark::State &state) { \
140+
benchForTruncOrZextVecInLoop<ty1, ty2>(state, \
141+
&truncOrZextVecInLoopWithVW8); \
66142
} \
67-
BENCHMARK(benchForTruncOrZextVecInLoopWithVW8From_##ty1##_To_##ty2##_); \
68-
void benchForTruncOrZextVecInLoopFrom_##ty1##_To_##ty2##_(benchmark::State &state) { \
69-
benchForTruncOrZextVecInLoop<ty1, ty2>(state); \
143+
BENCHMARK(benchForTruncOrZextVecInLoopWithVW8From_##ty1##_To_##ty2##_); \
144+
void benchForTruncOrZextVecInLoopWithVW16From_##ty1##_To_##ty2##_( \
145+
benchmark::State &state) { \
146+
benchForTruncOrZextVecInLoop<ty1, ty2>(state, \
147+
&truncOrZextVecInLoopWithVW16); \
70148
} \
71-
BENCHMARK(benchForTruncOrZextVecInLoopFrom_##ty1##_To_##ty2##_); \
149+
BENCHMARK(benchForTruncOrZextVecInLoopWithVW16From_##ty1##_To_##ty2##_); \
150+
void benchForTruncOrZextVecInLoopFrom_##ty1##_To_##ty2##_( \
151+
benchmark::State &state) { \
152+
benchForTruncOrZextVecInLoop<ty1, ty2>(state, &truncOrZextVecInLoop); \
153+
} \
154+
BENCHMARK(benchForTruncOrZextVecInLoopFrom_##ty1##_To_##ty2##_); \
155+
void benchForTruncOrZextVecWithAddInLoopWithVW8From_##ty1##_To_##ty2##_( \
156+
benchmark::State &state) { \
157+
benchForTruncOrZextVecWithAddInLoop<ty1, ty2>( \
158+
state, &truncOrZextVecWithAddInLoopWithVW8); \
159+
} \
160+
BENCHMARK( \
161+
benchForTruncOrZextVecWithAddInLoopWithVW8From_##ty1##_To_##ty2##_); \
162+
void benchForTruncOrZextVecWithAddInLoopWithVW16From_##ty1##_To_##ty2##_( \
163+
benchmark::State &state) { \
164+
benchForTruncOrZextVecWithAddInLoop<ty1, ty2>( \
165+
state, &truncOrZextVecWithAddInLoopWithVW16); \
166+
} \
167+
BENCHMARK( \
168+
benchForTruncOrZextVecWithAddInLoopWithVW16From_##ty1##_To_##ty2##_); \
169+
void benchForTruncOrZextVecWithAddInLoopFrom_##ty1##_To_##ty2##_( \
170+
benchmark::State &state) { \
171+
benchForTruncOrZextVecWithAddInLoop<ty1, ty2>( \
172+
state, &truncOrZextVecWithAddInLoop); \
173+
} \
174+
BENCHMARK(benchForTruncOrZextVecWithAddInLoopFrom_##ty1##_To_##ty2##_);
72175

73176
/* Vectorized truncate operations */
74-
ADD_BENCHMARK(uint64_t, uint8_t)
75-
ADD_BENCHMARK(uint32_t, uint8_t)
76177
ADD_BENCHMARK(uint16_t, uint8_t)
77-
178+
ADD_BENCHMARK(uint32_t, uint8_t)
179+
ADD_BENCHMARK(uint64_t, uint8_t)
180+
ADD_BENCHMARK(uint32_t, uint16_t)
181+
ADD_BENCHMARK(uint64_t, uint16_t)
182+
ADD_BENCHMARK(uint64_t, uint32_t)
78183

79184
/* Vectorized zero extend operations */
185+
ADD_BENCHMARK(uint8_t, uint16_t)
80186
ADD_BENCHMARK(uint8_t, uint32_t)
187+
ADD_BENCHMARK(uint8_t, uint64_t)
188+
ADD_BENCHMARK(uint16_t, uint32_t)
189+
ADD_BENCHMARK(uint16_t, uint64_t)
190+
ADD_BENCHMARK(uint32_t, uint64_t)

MultiSource/Applications/oggenc/oggenc.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,9 @@
88
* Portions from Vorbize, (c) Kenneth Arnold <[email protected]>
99
* and libvorbis examples, (c) Monty <[email protected]>
1010
*/
11+
#if __has_include(<alloca.h>)
1112
#include <alloca.h>
13+
#endif
1214
#include <assert.h>
1315
#include <ctype.h>
1416
#include <errno.h>

MultiSource/Benchmarks/MiBench/consumer-lame/rtp.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,9 @@
44
#include <unistd.h>
55
#include <stdlib.h>
66
#include <stdio.h>
7+
#if __has_include(<alloca.h>)
78
#include <alloca.h>
9+
#endif
810
#include <sys/types.h>
911
#include <sys/socket.h>
1012
#include <arpa/inet.h>

MultiSource/Benchmarks/TSVC/tsc.inc

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3760,14 +3760,14 @@ int s318( int inc)
37603760
for (int nl = 0; nl < ntimes/2; nl++) {
37613761
k = 0;
37623762
index = 0;
3763-
max = abs(a[0]);
3763+
max = FABS(a[0]);
37643764
k += inc;
37653765
for (int i = 1; i < LEN; i++) {
3766-
if (abs(a[k]) <= max) {
3766+
if (FABS(a[k]) <= max) {
37673767
goto L5;
37683768
}
37693769
index = i;
3770-
max = abs(a[k]);
3770+
max = FABS(a[k]);
37713771
L5:
37723772
k += inc;
37733773
}
@@ -3971,10 +3971,10 @@ int s3113()
39713971

39723972
TYPE max;
39733973
for (int nl = 0; nl < ntimes*4; nl++) {
3974-
max = abs(a[0]);
3974+
max = FABS(a[0]);
39753975
for (int i = 0; i < LEN; i++) {
3976-
if ((abs(a[i])) > max) {
3977-
max = abs(a[i]);
3976+
if ((FABS(a[i])) > max) {
3977+
max = FABS(a[i]);
39783978
}
39793979
}
39803980
dummy(a, b, c, d, e, aa, bb, cc, max);

MultiSource/Benchmarks/TSVC/types.h

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,14 @@
55
#define LEN2 256
66

77
#ifndef TYPE
8-
#define TYPE float
8+
#define TYPE float
9+
#define FABS(x) fabsf(x)
10+
#else
11+
#define FABS(x) fabs(x)
912
#endif
1013

1114
#ifndef X_TYPE
12-
#define X_TYPE TYPE
15+
#define X_TYPE TYPE
1316
#endif
1417

1518
#ifndef ALIGNMENT

SYCL/Assert/assert_in_kernels.cpp

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,16 @@
11
// REQUIRES: linux
2+
3+
// https://github.com/intel/llvm/issues/7634
4+
// UNSUPPORTED: hip
5+
26
// RUN: %clangxx -DSYCL_FALLBACK_ASSERT=1 -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
3-
// RUN: %CPU_RUN_PLACEHOLDER %t.out &> %t.txt || true
4-
// RUN: %CPU_RUN_PLACEHOLDER FileCheck %s --input-file %t.txt
5-
// RUN: %GPU_RUN_PLACEHOLDER %t.out &> %t.txt || true
6-
// RUN: %GPU_RUN_PLACEHOLDER FileCheck %s --input-file %t.txt
7+
// RUN: %CPU_RUN_PLACEHOLDER %t.out &> %t.cpu.txt || true
8+
// RUN: %CPU_RUN_PLACEHOLDER FileCheck %s --input-file %t.cpu.txt
9+
// RUN: %GPU_RUN_PLACEHOLDER %t.out &> %t.gpu.txt || true
10+
// RUN: %GPU_RUN_PLACEHOLDER FileCheck %s --input-file %t.gpu.txt
711
// Shouldn't fail on ACC as fallback assert isn't enqueued there
8-
// RUN: %ACC_RUN_PLACEHOLDER %t.out &> %t.txt
9-
// RUN: %ACC_RUN_PLACEHOLDER FileCheck %s --check-prefix=CHECK-ACC --input-file %t.txt
12+
// RUN: %ACC_RUN_PLACEHOLDER %t.out &> %t.acc.txt
13+
// RUN: %ACC_RUN_PLACEHOLDER FileCheck %s --check-prefix=CHECK-ACC --input-file %t.acc.txt
1014
//
1115
// CHECK-NOT: One shouldn't see this message
1216
// CHECK: {{.*}}assert_in_kernels.hpp:25: void kernelFunc2(int *, int): {{.*}} [{{[0,2]}},0,0], {{.*}} [0,0,0]

SYCL/Assert/assert_in_multiple_tus.cpp

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,16 @@
11
// REQUIRES: linux
2+
3+
// https://github.com/intel/llvm/issues/7634
4+
// UNSUPPORTED: hip
5+
26
// RUN: %clangxx -DSYCL_FALLBACK_ASSERT=1 -fsycl -fsycl-targets=%sycl_triple -I %S/Inputs %s %S/Inputs/kernels_in_file2.cpp -o %t.out
3-
// RUN: %CPU_RUN_PLACEHOLDER %t.out &> %t.txt || true
4-
// RUN: %CPU_RUN_PLACEHOLDER FileCheck %s --input-file %t.txt
5-
// RUN: %GPU_RUN_PLACEHOLDER %t.out &> %t.txt || true
6-
// RUN: %GPU_RUN_PLACEHOLDER FileCheck %s --input-file %t.txt
7+
// RUN: %CPU_RUN_PLACEHOLDER %t.out &> %t.cpu.txt || true
8+
// RUN: %CPU_RUN_PLACEHOLDER FileCheck %s --input-file %t.cpu.txt
9+
// RUN: %GPU_RUN_PLACEHOLDER %t.out &> %t.gpu.txt || true
10+
// RUN: %GPU_RUN_PLACEHOLDER FileCheck %s --input-file %t.gpu.txt
711
// Shouldn't fail on ACC as fallback assert isn't enqueued there
8-
// RUN: %ACC_RUN_PLACEHOLDER %t.out &> %t.txt
9-
// RUN: %ACC_RUN_PLACEHOLDER FileCheck %s --check-prefix=CHECK-ACC --input-file %t.txt
12+
// RUN: %ACC_RUN_PLACEHOLDER %t.out &> %t.acc.txt
13+
// RUN: %ACC_RUN_PLACEHOLDER FileCheck %s --check-prefix=CHECK-ACC --input-file %t.acc.txt
1014
//
1115
// CUDA uses block/thread vs global/local id for SYCL, also it shows the
1216
// position of a thread within the block, not the absolute ID.

SYCL/Assert/assert_in_multiple_tus_one_ndebug.cpp

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,16 @@
11
// REQUIRES: linux
2+
3+
// https://github.com/intel/llvm/issues/7634
4+
// UNSUPPORTED: hip
5+
26
// RUN: %clangxx -DSYCL_FALLBACK_ASSERT=1 -fsycl -fsycl-targets=%sycl_triple -DDEFINE_NDEBUG_INFILE2 -I %S/Inputs %S/assert_in_multiple_tus.cpp %S/Inputs/kernels_in_file2.cpp -o %t.out
3-
// RUN: %CPU_RUN_PLACEHOLDER %t.out &> %t.txt || true
4-
// RUN: %CPU_RUN_PLACEHOLDER FileCheck %s --input-file %t.txt
5-
// RUN: %GPU_RUN_PLACEHOLDER %t.out &> %t.txt || true
6-
// RUN: %GPU_RUN_PLACEHOLDER FileCheck %s --input-file %t.txt
7+
// RUN: %CPU_RUN_PLACEHOLDER %t.out &> %t.cpu.txt || true
8+
// RUN: %CPU_RUN_PLACEHOLDER FileCheck %s --input-file %t.cpu.txt
9+
// RUN: %GPU_RUN_PLACEHOLDER %t.out &> %t.gpu.txt || true
10+
// RUN: %GPU_RUN_PLACEHOLDER FileCheck %s --input-file %t.gpu.txt
711
// Shouldn't fail on ACC as fallback assert isn't enqueued there
8-
// RUN: %ACC_RUN_PLACEHOLDER %t.out &> %t.txt
9-
// RUN: %ACC_RUN_PLACEHOLDER FileCheck %s --check-prefix=CHECK-ACC --input-file %t.txt
12+
// RUN: %ACC_RUN_PLACEHOLDER %t.out &> %t.acc.txt
13+
// RUN: %ACC_RUN_PLACEHOLDER FileCheck %s --check-prefix=CHECK-ACC --input-file %t.acc.txt
1014
//
1115
// CHECK-NOT: this message from calculus
1216
// CUDA uses block/thread vs global/local id for SYCL, also it shows the

SYCL/Assert/assert_in_one_kernel.cpp

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,16 @@
11
// REQUIRES: linux
2+
3+
// https://github.com/intel/llvm/issues/7634
4+
// UNSUPPORTED: hip
5+
26
// RUN: %clangxx -DSYCL_FALLBACK_ASSERT=1 -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
3-
// RUN: %CPU_RUN_PLACEHOLDER %t.out &> %t.txt || true
4-
// RUN: %CPU_RUN_PLACEHOLDER FileCheck %s --input-file %t.txt
5-
// RUN: %GPU_RUN_PLACEHOLDER %t.out &> %t.txt || true
6-
// RUN: %GPU_RUN_PLACEHOLDER FileCheck %s --input-file %t.txt
7+
// RUN: %CPU_RUN_PLACEHOLDER %t.out &> %t.cpu.txt || true
8+
// RUN: %CPU_RUN_PLACEHOLDER FileCheck %s --input-file %t.cpu.txt
9+
// RUN: %GPU_RUN_PLACEHOLDER %t.out &> %t.gpu.txt || true
10+
// RUN: %GPU_RUN_PLACEHOLDER FileCheck %s --input-file %t.gpu.txt
711
// Shouldn't fail on ACC as fallback assert isn't enqueued there
8-
// RUN: %ACC_RUN_PLACEHOLDER %t.out &> %t.txt
9-
// RUN: %ACC_RUN_PLACEHOLDER FileCheck %s --check-prefix=CHECK-ACC --input-file %t.txt
12+
// RUN: %ACC_RUN_PLACEHOLDER %t.out &> %t.acc.txt
13+
// RUN: %ACC_RUN_PLACEHOLDER FileCheck %s --check-prefix=CHECK-ACC --input-file %t.acc.txt
1014
//
1115
// CHECK: {{.*}}assert_in_one_kernel.hpp:10: void kernelFunc(int *, int): {{.*}} [{{[0-3]}},0,0], {{.*}} [0,0,0]
1216
// CHECK-SAME: Assertion `Buf[wiID] != 0 && "from assert statement"` failed.

0 commit comments

Comments
 (0)