Skip to content

Commit 896749a

Browse files
[amdgpu][openmp] Avoiding writing to packet header twice (#70695)
I think it follows from the HSA spec that a write to the first byte is deemed significant to the GPU in which case writing to the second short and reading back from it later would be safe. However, the examples for this all involve an atomic write to the first 32 bits and it seems a credible risk that the occasional CI errors abound invalid packets have as their root cause that the firmware notices the early write to packet->setup and treats that as a sign that the packet is ready to go. That was overly-paranoid, however in passing noticed the code in libc is genuinely invalid. The memset writes a zero to the header byte, changing it from type_invalid (1) to type_vendor (0), at which point the GPU is free to read the 64 byte packet and interpret it as a vendor packet, which is probably why libc CI periodically errors about invalid packets. Also a drive by change to do the atomic store on a uint32_t consistently. I'm not sure offhand what __atomic_store_n on a uint16_t* and an int resolves to, seems better to be unambiguous there.
1 parent 9fe5700 commit 896749a

File tree

2 files changed

+16
-15
lines changed
  • libc/utils/gpu/loader/amdgpu
  • openmp/libomptarget/plugins-nextgen/amdgpu/src

2 files changed

+16
-15
lines changed

libc/utils/gpu/loader/amdgpu/Loader.cpp

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -222,34 +222,35 @@ hsa_status_t launch_kernel(hsa_agent_t dev_agent, hsa_executable_t executable,
222222
// Set up the packet for exeuction on the device. We currently only launch
223223
// with one thread on the device, forcing the rest of the wavefront to be
224224
// masked off.
225-
std::memset(packet, 0, sizeof(hsa_kernel_dispatch_packet_t));
226-
packet->setup = (1 + (params.num_blocks_y * params.num_threads_y != 1) +
227-
(params.num_blocks_z * params.num_threads_z != 1))
228-
<< HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
225+
uint16_t setup = (1 + (params.num_blocks_y * params.num_threads_y != 1) +
226+
(params.num_blocks_z * params.num_threads_z != 1))
227+
<< HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
229228
packet->workgroup_size_x = params.num_threads_x;
230229
packet->workgroup_size_y = params.num_threads_y;
231230
packet->workgroup_size_z = params.num_threads_z;
231+
packet->reserved0 = 0;
232232
packet->grid_size_x = params.num_blocks_x * params.num_threads_x;
233233
packet->grid_size_y = params.num_blocks_y * params.num_threads_y;
234234
packet->grid_size_z = params.num_blocks_z * params.num_threads_z;
235235
packet->private_segment_size = private_size;
236236
packet->group_segment_size = group_size;
237237
packet->kernel_object = kernel;
238238
packet->kernarg_address = args;
239-
239+
packet->reserved2 = 0;
240240
// Create a signal to indicate when this packet has been completed.
241241
if (hsa_status_t err =
242242
hsa_signal_create(1, 0, nullptr, &packet->completion_signal))
243243
handle_error(err);
244244

245245
// Initialize the packet header and set the doorbell signal to begin execution
246246
// by the HSA runtime.
247-
uint16_t setup = packet->setup;
248247
uint16_t header =
249248
(HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
250249
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE) |
251250
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);
252-
__atomic_store_n(&packet->header, header | (setup << 16), __ATOMIC_RELEASE);
251+
uint32_t header_word =
252+
header | (setup << 16u) __atomic_store_n((uint32_t *)&packet->header,
253+
header_word, __ATOMIC_RELEASE);
253254
hsa_signal_store_relaxed(queue->doorbell_signal, packet_id);
254255

255256
// Wait until the kernel has completed execution on the device. Periodically

openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -649,8 +649,8 @@ struct AMDGPUQueueTy {
649649
hsa_kernel_dispatch_packet_t *Packet = acquirePacket(PacketId);
650650
assert(Packet && "Invalid packet");
651651

652-
// The header of the packet is written in the last moment.
653-
Packet->setup = UINT16_C(1) << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
652+
// The first 32 bits of the packet are written after the other fields
653+
uint16_t Setup = UINT16_C(1) << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
654654
Packet->workgroup_size_x = NumThreads;
655655
Packet->workgroup_size_y = 1;
656656
Packet->workgroup_size_z = 1;
@@ -666,7 +666,7 @@ struct AMDGPUQueueTy {
666666
Packet->completion_signal = OutputSignal->get();
667667

668668
// Publish the packet. Do not modify the packet after this point.
669-
publishKernelPacket(PacketId, Packet);
669+
publishKernelPacket(PacketId, Setup, Packet);
670670

671671
return Plugin::success();
672672
}
@@ -743,17 +743,17 @@ struct AMDGPUQueueTy {
743743
/// Publish the kernel packet so that the HSA runtime can start processing
744744
/// the kernel launch. Do not modify the packet once this function is called.
745745
/// Assumes the queue lock is acquired.
746-
void publishKernelPacket(uint64_t PacketId,
746+
void publishKernelPacket(uint64_t PacketId, uint16_t Setup,
747747
hsa_kernel_dispatch_packet_t *Packet) {
748748
uint32_t *PacketPtr = reinterpret_cast<uint32_t *>(Packet);
749749

750-
uint16_t Setup = Packet->setup;
751750
uint16_t Header = HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
752751
Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
753752
Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
754753

755754
// Publish the packet. Do not modify the package after this point.
756-
__atomic_store_n(PacketPtr, Header | (Setup << 16), __ATOMIC_RELEASE);
755+
uint32_t HeaderWord = Header | (Setup << 16u);
756+
__atomic_store_n(PacketPtr, HeaderWord, __ATOMIC_RELEASE);
757757

758758
// Signal the doorbell about the published packet.
759759
hsa_signal_store_relaxed(Queue->doorbell_signal, PacketId);
@@ -765,14 +765,14 @@ struct AMDGPUQueueTy {
765765
void publishBarrierPacket(uint64_t PacketId,
766766
hsa_barrier_and_packet_t *Packet) {
767767
uint32_t *PacketPtr = reinterpret_cast<uint32_t *>(Packet);
768-
769768
uint16_t Setup = 0;
770769
uint16_t Header = HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE;
771770
Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
772771
Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
773772

774773
// Publish the packet. Do not modify the package after this point.
775-
__atomic_store_n(PacketPtr, Header | (Setup << 16), __ATOMIC_RELEASE);
774+
uint32_t HeaderWord = Header | (Setup << 16u);
775+
__atomic_store_n(PacketPtr, HeaderWord, __ATOMIC_RELEASE);
776776

777777
// Signal the doorbell about the published packet.
778778
hsa_signal_store_relaxed(Queue->doorbell_signal, PacketId);

0 commit comments

Comments
 (0)