+// This program tests vectorized truncates & zero-extends for performance and
+// correctness
 #include <iostream>
 #include <memory>
 #include <random>
@@ -11,70 +13,178 @@ static std::mt19937 rng;
 // Initialize array A with random numbers.
 template <typename Ty>
 static void init_data(const std::unique_ptr<Ty[]> &A, unsigned N) {
-  std::uniform_int_distribution<uint64_t> distrib(
-      std::numeric_limits<Ty>::min(), std::numeric_limits<Ty>::max());
-  for (unsigned i = 0; i < N; i++)
-    A[i] = static_cast<Ty>(distrib(rng));
+  std::uniform_int_distribution<Ty> distrib(std::numeric_limits<Ty>::min(),
+                                            std::numeric_limits<Ty>::max());
+  for (unsigned I = 0; I < N; I++)
+    A[I] = distrib(rng);
+}
+
+// Truncate/Zero-extend elements to create expected results with no
+// vectorization
+template <typename Ty1, typename Ty2>
+static void truncOrZextWithNoVec(const Ty1 *A, Ty2 *B, int Iterations) {
+#pragma clang loop vectorize(disable)
+  for (unsigned I = 0; I < Iterations; I++) {
+    B[I] = A[I];
+  }
 }
 
 // Truncate/Zero-extend each vector element in a vectorized loop with vectorization width 8
-template <typename Ty1, typename Ty2> static void truncOrZextVecInLoopWithVW8(const Ty1 *A, Ty2 *B, int iterations) {
+template <typename Ty1, typename Ty2>
+static void truncOrZextVecInLoopWithVW8(const Ty1 *A, Ty2 *B, int Iterations) {
 #pragma clang loop vectorize_width(8) interleave_count(4)
-  for (unsigned i = 0; i < iterations; i++) {
-    B[i] = A[i];
+  for (unsigned I = 0; I < Iterations; I++) {
+    B[I] = A[I];
+  }
+}
+
+// Truncate/Zero-extend each vector element in a vectorized loop with
+// vectorization width 16
+template <typename Ty1, typename Ty2>
+static void truncOrZextVecInLoopWithVW16(const Ty1 *A, Ty2 *B, int Iterations) {
+#pragma clang loop vectorize_width(16) interleave_count(4)
+  for (unsigned I = 0; I < Iterations; I++) {
+    B[I] = A[I];
+  }
+}
+
+// Truncate/Zero-extend each vector element in a vectorized loop
+template <typename Ty1, typename Ty2>
+static void truncOrZextVecInLoop(const Ty1 *A, Ty2 *B, int Iterations) {
+#pragma clang loop vectorize(enable)
+  for (unsigned I = 0; I < Iterations; I++) {
+    B[I] = A[I];
+  }
+}
+
+// Truncate/Zero-extend each vector element while adding in a vectorized loop
+// with vectorization width 8
+template <typename Ty1, typename Ty2>
+static void truncOrZextVecWithAddInLoopWithVW8(const Ty1 *A, Ty2 *B,
+                                               int Iterations) {
+#pragma clang loop vectorize_width(8) interleave_count(4)
+  for (unsigned I = 0; I < Iterations; I++) {
+    B[I] += A[I];
+  }
+}
+
+// Truncate/Zero-extend each vector element while adding in a vectorized loop
+// with vectorization width 16
+template <typename Ty1, typename Ty2>
+static void truncOrZextVecWithAddInLoopWithVW16(const Ty1 *A, Ty2 *B,
+                                                int Iterations) {
+#pragma clang loop vectorize_width(16) interleave_count(4)
+  for (unsigned I = 0; I < Iterations; I++) {
+    B[I] += A[I];
+  }
+}
+
+// Truncate/Zero-extend each vector element while adding in a vectorized loop
+template <typename Ty1, typename Ty2>
+static void truncOrZextVecWithAddInLoop(const Ty1 *A, Ty2 *B, int Iterations) {
+#pragma clang loop vectorize(enable)
+  for (unsigned I = 0; I < Iterations; I++) {
+    B[I] += A[I];
   }
 }
 
-template <typename Ty1, typename Ty2> static void __attribute__((always_inline))
-benchForTruncOrZextVecInLoopWithVW8(benchmark::State &state) {
+template <typename Ty1, typename Ty2>
+static void __attribute__((always_inline))
+benchForTruncOrZextVecInLoop(benchmark::State &state,
+                             void (*Fn)(const Ty1 *, Ty2 *, int)) {
   std::unique_ptr<Ty1[]> A(new Ty1[ITERATIONS]);
   std::unique_ptr<Ty2[]> B(new Ty2[ITERATIONS]);
+  std::unique_ptr<Ty2[]> C(new Ty2[ITERATIONS]);
+
   init_data(A, ITERATIONS);
-  init_data(B, ITERATIONS);
+
+  // Check for correctness
+  truncOrZextWithNoVec(&A[0], &C[0], ITERATIONS);
+  Fn(&A[0], &B[0], ITERATIONS);
+  for (int I = 0; I < ITERATIONS; I++) {
+    if (B[I] != C[I]) {
+      std::cerr << "ERROR: Trunc or ZExt operation on " << A[I]
+                << " is showing result " << B[I] << " instead of " << C[I]
+                << "\n";
+      exit(1);
+    }
+  }
+
   for (auto _ : state) {
     benchmark::DoNotOptimize(B);
     benchmark::ClobberMemory();
-    truncOrZextVecInLoopWithVW8(&A[0], &B[0], ITERATIONS);
-  }
-}
-
-// Truncate/Zero-extend each vector element in a vectorized loop
-template <typename Ty1, typename Ty2> static void truncOrZextVecInLoop(const Ty1 *A, Ty2 *B, int iterations) {
-#pragma clang loop interleave_count(4)
-  for (unsigned i = 0; i < iterations; i++) {
-    B[i] = A[i];
+    Fn(&A[0], &B[0], ITERATIONS);
   }
 }
 
-template <typename Ty1, typename Ty2> static void __attribute__((always_inline))
-benchForTruncOrZextVecInLoop(benchmark::State &state) {
+template <typename Ty1, typename Ty2>
+static void __attribute__((always_inline))
+benchForTruncOrZextVecWithAddInLoop(benchmark::State &state,
+                                    void (*Fn)(const Ty1 *, Ty2 *, int)) {
   std::unique_ptr<Ty1[]> A(new Ty1[ITERATIONS]);
   std::unique_ptr<Ty2[]> B(new Ty2[ITERATIONS]);
+  std::unique_ptr<Ty2[]> C(new Ty2[ITERATIONS]);
   init_data(A, ITERATIONS);
   init_data(B, ITERATIONS);
   for (auto _ : state) {
     benchmark::DoNotOptimize(B);
     benchmark::ClobberMemory();
-    truncOrZextVecInLoop(&A[0], &B[0], ITERATIONS);
+    Fn(&A[0], &B[0], ITERATIONS);
   }
 }
 
 // Add vectorized truncate or zero-extend operation benchmarks for different element types
-#define ADD_BENCHMARK(ty1, ty2) \
-  void benchForTruncOrZextVecInLoopWithVW8From_##ty1##_To_##ty2##_(benchmark::State &state) { \
-    benchForTruncOrZextVecInLoopWithVW8<ty1, ty2>(state); \
+#define ADD_BENCHMARK(ty1, ty2) \
+  void benchForTruncOrZextVecInLoopWithVW8From_##ty1##_To_##ty2##_( \
+      benchmark::State &state) { \
+    benchForTruncOrZextVecInLoop<ty1, ty2>(state, \
+                                           &truncOrZextVecInLoopWithVW8); \
   } \
-  BENCHMARK(benchForTruncOrZextVecInLoopWithVW8From_##ty1##_To_##ty2##_); \
-  void benchForTruncOrZextVecInLoopFrom_##ty1##_To_##ty2##_(benchmark::State &state) { \
-    benchForTruncOrZextVecInLoop<ty1, ty2>(state); \
+  BENCHMARK(benchForTruncOrZextVecInLoopWithVW8From_##ty1##_To_##ty2##_); \
+  void benchForTruncOrZextVecInLoopWithVW16From_##ty1##_To_##ty2##_( \
+      benchmark::State &state) { \
+    benchForTruncOrZextVecInLoop<ty1, ty2>(state, \
+                                           &truncOrZextVecInLoopWithVW16); \
  } \
-  BENCHMARK(benchForTruncOrZextVecInLoopFrom_##ty1##_To_##ty2##_); \
+  BENCHMARK(benchForTruncOrZextVecInLoopWithVW16From_##ty1##_To_##ty2##_); \
+  void benchForTruncOrZextVecInLoopFrom_##ty1##_To_##ty2##_( \
+      benchmark::State &state) { \
+    benchForTruncOrZextVecInLoop<ty1, ty2>(state, &truncOrZextVecInLoop); \
+  } \
+  BENCHMARK(benchForTruncOrZextVecInLoopFrom_##ty1##_To_##ty2##_); \
+  void benchForTruncOrZextVecWithAddInLoopWithVW8From_##ty1##_To_##ty2##_( \
+      benchmark::State &state) { \
+    benchForTruncOrZextVecWithAddInLoop<ty1, ty2>( \
+        state, &truncOrZextVecWithAddInLoopWithVW8); \
+  } \
+  BENCHMARK( \
+      benchForTruncOrZextVecWithAddInLoopWithVW8From_##ty1##_To_##ty2##_); \
+  void benchForTruncOrZextVecWithAddInLoopWithVW16From_##ty1##_To_##ty2##_( \
+      benchmark::State &state) { \
+    benchForTruncOrZextVecWithAddInLoop<ty1, ty2>( \
+        state, &truncOrZextVecWithAddInLoopWithVW16); \
+  } \
+  BENCHMARK( \
+      benchForTruncOrZextVecWithAddInLoopWithVW16From_##ty1##_To_##ty2##_); \
+  void benchForTruncOrZextVecWithAddInLoopFrom_##ty1##_To_##ty2##_( \
+      benchmark::State &state) { \
+    benchForTruncOrZextVecWithAddInLoop<ty1, ty2>( \
+        state, &truncOrZextVecWithAddInLoop); \
+  } \
+  BENCHMARK(benchForTruncOrZextVecWithAddInLoopFrom_##ty1##_To_##ty2##_);
 
 /* Vectorized truncate operations */
-ADD_BENCHMARK(uint64_t, uint8_t)
-ADD_BENCHMARK(uint32_t, uint8_t)
 ADD_BENCHMARK(uint16_t, uint8_t)
-
+ADD_BENCHMARK(uint32_t, uint8_t)
+ADD_BENCHMARK(uint64_t, uint8_t)
+ADD_BENCHMARK(uint32_t, uint16_t)
+ADD_BENCHMARK(uint64_t, uint16_t)
+ADD_BENCHMARK(uint64_t, uint32_t)
 
 /* Vectorized zero extend operations */
+ADD_BENCHMARK(uint8_t, uint16_t)
 ADD_BENCHMARK(uint8_t, uint32_t)
+ADD_BENCHMARK(uint8_t, uint64_t)
+ADD_BENCHMARK(uint16_t, uint32_t)
+ADD_BENCHMARK(uint16_t, uint64_t)
+ADD_BENCHMARK(uint32_t, uint64_t)
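For readers tracing the ADD_BENCHMARK macro above: each invocation such as ADD_BENCHMARK(uint64_t, uint8_t) registers six benchmarks, one per kernel variant. Hand-expanding the first two registrations gives roughly the following (an illustrative expansion derived from the macro text, not code that appears in the commit):

void benchForTruncOrZextVecInLoopWithVW8From_uint64_t_To_uint8_t_(
    benchmark::State &state) {
  // Runs the width-8 vectorized truncation kernel through the shared harness,
  // which first checks the result against the unvectorized reference.
  benchForTruncOrZextVecInLoop<uint64_t, uint8_t>(state,
                                                  &truncOrZextVecInLoopWithVW8);
}
BENCHMARK(benchForTruncOrZextVecInLoopWithVW8From_uint64_t_To_uint8_t_);

void benchForTruncOrZextVecInLoopWithVW16From_uint64_t_To_uint8_t_(
    benchmark::State &state) {
  benchForTruncOrZextVecInLoop<uint64_t, uint8_t>(
      state, &truncOrZextVecInLoopWithVW16);
}
BENCHMARK(benchForTruncOrZextVecInLoopWithVW16From_uint64_t_To_uint8_t_);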
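As a reminder of what the B[I] = A[I] kernels above compute: assigning between unsigned integer types of different widths is a truncation (keep only the low bits) when narrowing and a zero extension (fill the high bits with zeros) when widening. A minimal standalone illustration, independent of the benchmark file:

#include <cassert>
#include <cstdint>

int main() {
  // Truncation: uint64_t -> uint8_t keeps only the low 8 bits.
  uint64_t Wide = 0x1234; // 0b0001'0010'0011'0100
  uint8_t Narrow = static_cast<uint8_t>(Wide);
  assert(Narrow == 0x34);

  // Zero extension: uint8_t -> uint32_t pads the high bits with zeros.
  uint8_t Byte = 0xFF;
  uint32_t Word = Byte;
  assert(Word == 0x000000FFu);
  return 0;
}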