|
| 1 | +#include <CL/sycl.hpp> |
| 2 | +#include <sycl/ext/intel/esimd.hpp> |
| 3 | + |
| 4 | +#include <iostream> |
| 5 | + |
| 6 | +using namespace cl::sycl; |
| 7 | +using namespace sycl::ext::intel::esimd; |
| 8 | +using namespace sycl::ext::intel::experimental::esimd; |
| 9 | + |
| 10 | +template <int CaseNum, typename T, uint32_t Groups, uint32_t LocalRange, |
| 11 | + uint16_t VL, uint16_t NChannels, bool Transpose, |
| 12 | + lsc_data_size DS = lsc_data_size::default_size> |
| 13 | +bool test(uint32_t PMask = ~0) { |
| 14 | + static_assert((NChannels == 1) || !Transpose, |
| 15 | + "Transpose must have exec size 1"); |
| 16 | + if constexpr (DS == lsc_data_size::u8u32 || DS == lsc_data_size::u16u32) { |
| 17 | + static_assert(!Transpose, "Conversion types may not use vector"); |
| 18 | + static_assert(NChannels == 1, "Only D32 and D64 support vector load"); |
| 19 | + } |
| 20 | + |
| 21 | + static_assert(DS != lsc_data_size::u16u32h, "D16U32h not supported in HW"); |
| 22 | + static_assert(sizeof(T) >= 4, |
| 23 | + "D8 and D16 are valid only in 2D block load/store"); |
| 24 | + |
| 25 | + if constexpr (!Transpose && NChannels > 1) { |
| 26 | + static_assert(VL == 16 || VL == 32, |
| 27 | + "IGC prohibits execution size less than SIMD size when " |
| 28 | + "vector size is greater than 1"); |
| 29 | + } |
| 30 | + |
| 31 | + T VMask = static_cast<T>(-1); |
| 32 | + if constexpr (DS == lsc_data_size::u8u32) |
| 33 | + VMask = static_cast<T>(0xff); |
| 34 | + else if constexpr (DS == lsc_data_size::u16u32) |
| 35 | + VMask = static_cast<T>(0xffff); |
| 36 | + else if constexpr (DS == lsc_data_size::u16u32h) |
| 37 | + VMask = static_cast<T>(0xffff0000); |
| 38 | + |
| 39 | + queue Q(gpu_selector{}); |
| 40 | + auto D = Q.get_device(); |
| 41 | + std::cout << "Running case #" << CaseNum << " on " |
| 42 | + << D.get_info<info::device::name>() << std::endl; |
| 43 | + |
| 44 | + nd_range<1> Range{range<1>{Groups * LocalRange}, range<1>{LocalRange}}; |
| 45 | + constexpr uint16_t OutSize = Groups * LocalRange * VL * NChannels; |
| 46 | + T *Out = malloc_shared<T>(OutSize, Q); |
| 47 | + memset(Out, 0, OutSize * sizeof(T)); |
| 48 | + |
| 49 | + try { |
| 50 | + Q.submit([&](handler &cgh) { |
| 51 | + cgh.parallel_for(Range, [=](sycl::nd_item<1> NDId) SYCL_ESIMD_KERNEL { |
| 52 | + uint32_t GID = NDId.get_global_id(0); |
| 53 | + uint32_t LID = NDId.get_local_id(0); |
| 54 | + uint32_t GroupID = NDId.get_group_linear_id(); |
| 55 | + |
| 56 | + // 1. Allocate and init 128-byte multiple size SLM memory with special |
| 57 | + // values. |
| 58 | + constexpr uint32_t ResultSIMDByteSize = VL * NChannels * sizeof(T); |
| 59 | + constexpr uint32_t SLMSize = |
| 60 | + (ResultSIMDByteSize * LocalRange + 127) & ~127; |
| 61 | + slm_init(SLMSize); |
| 62 | + if (NDId.get_local_id(0) == 0) { |
| 63 | + simd<T, 4> Vals = static_cast<T>(0xBAADF00DBAADF00D); |
| 64 | + for (int I = 0; I < SLMSize; I += 4 * sizeof(T)) |
| 65 | + slm_block_store<T, 4>(I, Vals); |
| 66 | + } |
| 67 | + barrier(); |
| 68 | + |
| 69 | + // 2. Use STORE intrinscis that are being verified in this test. |
| 70 | + if constexpr (Transpose) { |
| 71 | + simd<T, VL> Vals(GroupID * 1000000 + LID * 1000, 1); |
| 72 | + lsc_slm_block_store<T, VL, DS>(LID * VL * sizeof(T), Vals); |
| 73 | + } else { |
| 74 | + |
| 75 | + // Create the predicate for the gather from 'PMask'. |
| 76 | + simd_mask<VL> Pred; |
| 77 | + for (int I = 0; I < VL; I++) |
| 78 | + Pred.template select<1, 1>(I) = (PMask >> I) & 1; |
| 79 | + |
| 80 | + simd<T, VL * NChannels> Vals(GroupID * 1000000 + LID * 1000, 1); |
| 81 | + simd<uint32_t, VL> Offsets(LID * VL * NChannels * sizeof(T), |
| 82 | + NChannels * sizeof(T)); |
| 83 | + lsc_slm_scatter<T, NChannels, DS>(Offsets, Vals, Pred); |
| 84 | + } |
| 85 | + barrier(); |
| 86 | + |
| 87 | + // 3. Simply load the content of SLM and store it to USM. |
| 88 | + if (NDId.get_local_id(0) == 0) { |
| 89 | + int End = LocalRange * VL * NChannels; |
| 90 | + for (int I = 0; I < End; I += 4) { |
| 91 | + auto Vals = slm_block_load<T, 4>(I * sizeof(T)); |
| 92 | + |
| 93 | + // If 'VL' is small, simd<T, 4> cannot be safely used |
| 94 | + if (I + 4 > End) { |
| 95 | + for (int J = 0; J + I < End; J++) |
| 96 | + Out[GroupID * LocalRange * VL * NChannels + I + J] = |
| 97 | + (T)Vals[J]; |
| 98 | + } else { |
| 99 | + Vals.copy_to(Out + GroupID * LocalRange * VL * NChannels + I); |
| 100 | + } |
| 101 | + } |
| 102 | + } |
| 103 | + }); |
| 104 | + }).wait(); |
| 105 | + } catch (sycl::exception const &e) { |
| 106 | + std::cout << "SYCL exception caught: " << e.what() << '\n'; |
| 107 | + sycl::free(Out, Q); |
| 108 | + return false; |
| 109 | + } |
| 110 | + |
| 111 | + bool Passed = true; |
| 112 | + |
| 113 | + if constexpr (Transpose) { |
| 114 | + for (uint32_t I = 0; I < OutSize; I++) { |
| 115 | + uint32_t GroupId = I / (LocalRange * VL); |
| 116 | + uint32_t LID = I / VL % LocalRange; |
| 117 | + T ExpectedVal = GroupId * 1000000 + LID * 1000 + I % VL; |
| 118 | + if (Out[I] != ExpectedVal) { |
| 119 | + Passed = false; |
| 120 | + std::cout << I << ": Value = " << Out[I] |
| 121 | + << ", Expected value = " << ExpectedVal << std::endl; |
| 122 | + } |
| 123 | + } |
| 124 | + } else { |
| 125 | + for (uint32_t I = 0; I < OutSize; I += VL * NChannels) { |
| 126 | + uint32_t GroupId = I / (LocalRange * VL * NChannels); |
| 127 | + uint32_t LID = I / (VL * NChannels) % LocalRange; |
| 128 | + T ExpectedValBase = GroupId * 1000000 + LID * 1000 + I % (VL * NChannels); |
| 129 | + T ExpectedValInc = 0; |
| 130 | + uint32_t MaskIndex = 0; |
| 131 | + uint32_t MaskIndexTimer = 0; |
| 132 | + for (int ChannelId = 0; ChannelId < NChannels; ChannelId++) { |
| 133 | + for (int J = 0; J < VL; J++) { |
| 134 | + uint32_t OutIndex = I + ChannelId * VL + J; |
| 135 | + T ExpectedVal = ((PMask >> MaskIndex) & 1) |
| 136 | + ? (ExpectedValBase + ExpectedValInc) |
| 137 | + : static_cast<T>(0xBAADF00DBAADF00D); |
| 138 | + ExpectedVal &= VMask; |
| 139 | + MaskIndexTimer++; |
| 140 | + if (MaskIndexTimer >= NChannels) { |
| 141 | + MaskIndexTimer = 0; |
| 142 | + MaskIndex++; |
| 143 | + } |
| 144 | + |
| 145 | + ExpectedValInc += VL; |
| 146 | + if (ExpectedValInc >= VL * NChannels) |
| 147 | + ExpectedValInc = (ExpectedValInc % (VL * NChannels)) + 1; |
| 148 | + |
| 149 | + T OutVal = Out[OutIndex] & VMask; |
| 150 | + if (OutVal != ExpectedVal) { |
| 151 | + Passed = false; |
| 152 | + std::cout << OutIndex << ": Value = " << Out[OutIndex] |
| 153 | + << ", Expected value = " << ExpectedVal << std::endl; |
| 154 | + } |
| 155 | + } |
| 156 | + } |
| 157 | + } |
| 158 | + } |
| 159 | + |
| 160 | + sycl::free(Out, Q); |
| 161 | + |
| 162 | + if (!Passed) |
| 163 | + std::cout << "Case #" << CaseNum << " FAILED" << std::endl; |
| 164 | + return Passed; |
| 165 | +} |
0 commit comments