@@ -6741,6 +6741,81 @@ entry:
6741
6741
ret void
6742
6742
}
6743
6743
6744
; Atomic seq_cst store of the bfloat kernel argument %in to %out plus 8
; elements (getelementptr bfloat ... i64 8, i.e. 16 bytes for a 2-byte element).
; The store has to go through %gep: storing to %out directly leaves the GEP
; dead and makes this test identical to @atomic_store_bf16.
; NOTE(review): the assertions below were adjusted by hand for the new address
; (an s_add_u32/s_addc_u32 pair on GCN1 and GCN2, an offset:16 operand on GCN3)
; and the exact register assignment is a best guess - regenerate them with
; utils/update_llc_test_checks.py before committing.
+ define amdgpu_kernel void @atomic_store_bf16_offset (bfloat %in , ptr %out ) {
6745
+ ; GCN1-LABEL: atomic_store_bf16_offset:
6746
+ ; GCN1: ; %bb.0:
6747
+ ; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb
6748
+ ; GCN1-NEXT: s_load_dword s4, s[0:1], 0x9
6749
+ ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN1-NEXT: s_add_u32 s0, s2, 16
+ ; GCN1-NEXT: s_addc_u32 s1, s3, 0
6750
+ ; GCN1-NEXT: v_mov_b32_e32 v0, s0
6751
+ ; GCN1-NEXT: v_mov_b32_e32 v1, s1
6752
+ ; GCN1-NEXT: v_mov_b32_e32 v2, s4
6753
+ ; GCN1-NEXT: flat_store_short v[0:1], v2
6754
+ ; GCN1-NEXT: s_endpgm
6755
+ ;
6756
+ ; GCN2-LABEL: atomic_store_bf16_offset:
6757
+ ; GCN2: ; %bb.0:
6758
+ ; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
6759
+ ; GCN2-NEXT: s_load_dword s4, s[0:1], 0x24
6760
+ ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN2-NEXT: s_add_u32 s0, s2, 16
+ ; GCN2-NEXT: s_addc_u32 s1, s3, 0
6761
+ ; GCN2-NEXT: v_mov_b32_e32 v0, s0
6762
+ ; GCN2-NEXT: v_mov_b32_e32 v1, s1
6763
+ ; GCN2-NEXT: v_mov_b32_e32 v2, s4
6764
+ ; GCN2-NEXT: flat_store_short v[0:1], v2
6765
+ ; GCN2-NEXT: s_endpgm
6766
+ ;
6767
+ ; GCN3-LABEL: atomic_store_bf16_offset:
6768
+ ; GCN3: ; %bb.0:
6769
+ ; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
6770
+ ; GCN3-NEXT: s_load_dword s4, s[0:1], 0x24
6771
+ ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
6772
+ ; GCN3-NEXT: v_mov_b32_e32 v0, s2
6773
+ ; GCN3-NEXT: v_mov_b32_e32 v1, s3
6774
+ ; GCN3-NEXT: v_mov_b32_e32 v2, s4
6775
+ ; GCN3-NEXT: flat_store_short v[0:1], v2 offset:16
6776
+ ; GCN3-NEXT: s_endpgm
6777
+ %gep = getelementptr bfloat, ptr %out , i64 8
6778
+ store atomic bfloat %in , ptr %gep seq_cst , align 2
6779
+ ret void
6780
+ }
6781
+
6782
; Atomic seq_cst store of the bfloat kernel argument %in directly to %out
; (align 2, no address offset).
; Every check prefix expects the pointer and value to be fetched with
; s_load_dwordx2/s_load_dword, moved into v0-v2 with v_mov_b32, and written
; with a single flat_store_short.
; GCN1, GCN2 and GCN3 are FileCheck prefixes; the RUN lines that define them
; are outside this excerpt.
+ define amdgpu_kernel void @atomic_store_bf16 (bfloat %in , ptr %out ) {
6783
+ ; GCN1-LABEL: atomic_store_bf16:
6784
+ ; GCN1: ; %bb.0:
6785
+ ; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb
6786
+ ; GCN1-NEXT: s_load_dword s0, s[0:1], 0x9
6787
+ ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
6788
+ ; GCN1-NEXT: v_mov_b32_e32 v0, s2
6789
+ ; GCN1-NEXT: v_mov_b32_e32 v1, s3
6790
+ ; GCN1-NEXT: v_mov_b32_e32 v2, s0
6791
+ ; GCN1-NEXT: flat_store_short v[0:1], v2
6792
+ ; GCN1-NEXT: s_endpgm
6793
+ ;
6794
+ ; GCN2-LABEL: atomic_store_bf16:
6795
+ ; GCN2: ; %bb.0:
6796
+ ; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
6797
+ ; GCN2-NEXT: s_load_dword s0, s[0:1], 0x24
6798
+ ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
6799
+ ; GCN2-NEXT: v_mov_b32_e32 v0, s2
6800
+ ; GCN2-NEXT: v_mov_b32_e32 v1, s3
6801
+ ; GCN2-NEXT: v_mov_b32_e32 v2, s0
6802
+ ; GCN2-NEXT: flat_store_short v[0:1], v2
6803
+ ; GCN2-NEXT: s_endpgm
6804
+ ;
6805
+ ; GCN3-LABEL: atomic_store_bf16:
6806
+ ; GCN3: ; %bb.0:
6807
+ ; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
6808
+ ; GCN3-NEXT: s_load_dword s4, s[0:1], 0x24
6809
+ ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
6810
+ ; GCN3-NEXT: v_mov_b32_e32 v0, s2
6811
+ ; GCN3-NEXT: v_mov_b32_e32 v1, s3
6812
+ ; GCN3-NEXT: v_mov_b32_e32 v2, s4
6813
+ ; GCN3-NEXT: flat_store_short v[0:1], v2
6814
+ ; GCN3-NEXT: s_endpgm
6815
+ store atomic bfloat %in , ptr %out seq_cst , align 2
6816
+ ret void
6817
+ }
6818
+
6744
6819
define amdgpu_kernel void @atomic_inc_i32_offset (ptr %out , i32 %in ) {
6745
6820
; GCN1-LABEL: atomic_inc_i32_offset:
6746
6821
; GCN1: ; %bb.0: ; %entry
@@ -7868,3 +7943,201 @@ entry:
7868
7943
store i32 %val , ptr %out2
7869
7944
ret void
7870
7945
}
7946
+
7947
; Atomic seq_cst load of a half from %in plus 8 elements (the GCN3 checks show
; this as a 16-byte offset), followed by a plain store of the value to %out.
; GCN1 and GCN2 are expected to add 16 to the pointer with
; s_add_u32/s_addc_u32; GCN3 is expected to put offset:16 on the flat load.
; All three expect flat_load_ushort with glc, then a wait on vmcnt(0) and
; lgkmcnt(0) and a buffer_wbinvl1_vol before the result is stored.
; GCN1, GCN2 and GCN3 are FileCheck prefixes; the RUN lines that define them
; are outside this excerpt.
+ define amdgpu_kernel void @atomic_load_f16_offset (ptr %in , ptr %out ) {
7948
+ ; GCN1-LABEL: atomic_load_f16_offset:
7949
+ ; GCN1: ; %bb.0:
7950
+ ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
7951
+ ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
7952
+ ; GCN1-NEXT: s_add_u32 s0, s0, 16
7953
+ ; GCN1-NEXT: s_addc_u32 s1, s1, 0
7954
+ ; GCN1-NEXT: v_mov_b32_e32 v0, s0
7955
+ ; GCN1-NEXT: v_mov_b32_e32 v1, s1
7956
+ ; GCN1-NEXT: flat_load_ushort v2, v[0:1] glc
7957
+ ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7958
+ ; GCN1-NEXT: buffer_wbinvl1_vol
7959
+ ; GCN1-NEXT: v_mov_b32_e32 v0, s2
7960
+ ; GCN1-NEXT: v_mov_b32_e32 v1, s3
7961
+ ; GCN1-NEXT: flat_store_short v[0:1], v2
7962
+ ; GCN1-NEXT: s_endpgm
7963
+ ;
7964
+ ; GCN2-LABEL: atomic_load_f16_offset:
7965
+ ; GCN2: ; %bb.0:
7966
+ ; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
7967
+ ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
7968
+ ; GCN2-NEXT: s_add_u32 s0, s0, 16
7969
+ ; GCN2-NEXT: s_addc_u32 s1, s1, 0
7970
+ ; GCN2-NEXT: v_mov_b32_e32 v0, s0
7971
+ ; GCN2-NEXT: v_mov_b32_e32 v1, s1
7972
+ ; GCN2-NEXT: flat_load_ushort v2, v[0:1] glc
7973
+ ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7974
+ ; GCN2-NEXT: buffer_wbinvl1_vol
7975
+ ; GCN2-NEXT: v_mov_b32_e32 v0, s2
7976
+ ; GCN2-NEXT: v_mov_b32_e32 v1, s3
7977
+ ; GCN2-NEXT: flat_store_short v[0:1], v2
7978
+ ; GCN2-NEXT: s_endpgm
7979
+ ;
7980
+ ; GCN3-LABEL: atomic_load_f16_offset:
7981
+ ; GCN3: ; %bb.0:
7982
+ ; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
7983
+ ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
7984
+ ; GCN3-NEXT: v_mov_b32_e32 v0, s0
7985
+ ; GCN3-NEXT: v_mov_b32_e32 v1, s1
7986
+ ; GCN3-NEXT: flat_load_ushort v2, v[0:1] offset:16 glc
7987
+ ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7988
+ ; GCN3-NEXT: buffer_wbinvl1_vol
7989
+ ; GCN3-NEXT: v_mov_b32_e32 v0, s2
7990
+ ; GCN3-NEXT: v_mov_b32_e32 v1, s3
7991
+ ; GCN3-NEXT: flat_store_short v[0:1], v2
7992
+ ; GCN3-NEXT: s_endpgm
7993
+ %gep = getelementptr half , ptr %in , i64 8
7994
+ %val = load atomic half , ptr %gep seq_cst , align 2
7995
+ store half %val , ptr %out
7996
+ ret void
7997
+ }
7998
+
7999
; Atomic seq_cst load of a half directly from %in (align 2, no address
; offset), followed by a plain store of the loaded value to %out.
; All three check prefixes expect the same sequence: flat_load_ushort with
; glc, a wait on vmcnt(0) and lgkmcnt(0), buffer_wbinvl1_vol, then
; flat_store_short of the result. Only the s_load_dwordx4 immediate differs
; (0x9 for GCN1, 0x24 for GCN2 and GCN3).
; GCN1, GCN2 and GCN3 are FileCheck prefixes; the RUN lines that define them
; are outside this excerpt.
+ define amdgpu_kernel void @atomic_load_f16 (ptr %in , ptr %out ) {
8000
+ ; GCN1-LABEL: atomic_load_f16:
8001
+ ; GCN1: ; %bb.0:
8002
+ ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
8003
+ ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
8004
+ ; GCN1-NEXT: v_mov_b32_e32 v0, s0
8005
+ ; GCN1-NEXT: v_mov_b32_e32 v1, s1
8006
+ ; GCN1-NEXT: flat_load_ushort v2, v[0:1] glc
8007
+ ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8008
+ ; GCN1-NEXT: buffer_wbinvl1_vol
8009
+ ; GCN1-NEXT: v_mov_b32_e32 v0, s2
8010
+ ; GCN1-NEXT: v_mov_b32_e32 v1, s3
8011
+ ; GCN1-NEXT: flat_store_short v[0:1], v2
8012
+ ; GCN1-NEXT: s_endpgm
8013
+ ;
8014
+ ; GCN2-LABEL: atomic_load_f16:
8015
+ ; GCN2: ; %bb.0:
8016
+ ; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
8017
+ ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
8018
+ ; GCN2-NEXT: v_mov_b32_e32 v0, s0
8019
+ ; GCN2-NEXT: v_mov_b32_e32 v1, s1
8020
+ ; GCN2-NEXT: flat_load_ushort v2, v[0:1] glc
8021
+ ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8022
+ ; GCN2-NEXT: buffer_wbinvl1_vol
8023
+ ; GCN2-NEXT: v_mov_b32_e32 v0, s2
8024
+ ; GCN2-NEXT: v_mov_b32_e32 v1, s3
8025
+ ; GCN2-NEXT: flat_store_short v[0:1], v2
8026
+ ; GCN2-NEXT: s_endpgm
8027
+ ;
8028
+ ; GCN3-LABEL: atomic_load_f16:
8029
+ ; GCN3: ; %bb.0:
8030
+ ; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
8031
+ ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
8032
+ ; GCN3-NEXT: v_mov_b32_e32 v0, s0
8033
+ ; GCN3-NEXT: v_mov_b32_e32 v1, s1
8034
+ ; GCN3-NEXT: flat_load_ushort v2, v[0:1] glc
8035
+ ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8036
+ ; GCN3-NEXT: buffer_wbinvl1_vol
8037
+ ; GCN3-NEXT: v_mov_b32_e32 v0, s2
8038
+ ; GCN3-NEXT: v_mov_b32_e32 v1, s3
8039
+ ; GCN3-NEXT: flat_store_short v[0:1], v2
8040
+ ; GCN3-NEXT: s_endpgm
8041
+ %val = load atomic half , ptr %in seq_cst , align 2
8042
+ store half %val , ptr %out
8043
+ ret void
8044
+ }
8045
+
8046
; Atomic seq_cst load of a bfloat from %in plus 8 elements (the GCN3 checks
; show this as a 16-byte offset), followed by a plain store of the value to
; %out.
; GCN1 and GCN2 are expected to add 16 to the pointer with
; s_add_u32/s_addc_u32; GCN3 is expected to put offset:16 on the flat load.
; All three expect flat_load_ushort with glc, then a wait on vmcnt(0) and
; lgkmcnt(0) and a buffer_wbinvl1_vol before the result is stored.
; GCN1, GCN2 and GCN3 are FileCheck prefixes; the RUN lines that define them
; are outside this excerpt.
+ define amdgpu_kernel void @atomic_load_bf16_offset (ptr %in , ptr %out ) {
8047
+ ; GCN1-LABEL: atomic_load_bf16_offset:
8048
+ ; GCN1: ; %bb.0:
8049
+ ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
8050
+ ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
8051
+ ; GCN1-NEXT: s_add_u32 s0, s0, 16
8052
+ ; GCN1-NEXT: s_addc_u32 s1, s1, 0
8053
+ ; GCN1-NEXT: v_mov_b32_e32 v0, s0
8054
+ ; GCN1-NEXT: v_mov_b32_e32 v1, s1
8055
+ ; GCN1-NEXT: flat_load_ushort v2, v[0:1] glc
8056
+ ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8057
+ ; GCN1-NEXT: buffer_wbinvl1_vol
8058
+ ; GCN1-NEXT: v_mov_b32_e32 v0, s2
8059
+ ; GCN1-NEXT: v_mov_b32_e32 v1, s3
8060
+ ; GCN1-NEXT: flat_store_short v[0:1], v2
8061
+ ; GCN1-NEXT: s_endpgm
8062
+ ;
8063
+ ; GCN2-LABEL: atomic_load_bf16_offset:
8064
+ ; GCN2: ; %bb.0:
8065
+ ; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
8066
+ ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
8067
+ ; GCN2-NEXT: s_add_u32 s0, s0, 16
8068
+ ; GCN2-NEXT: s_addc_u32 s1, s1, 0
8069
+ ; GCN2-NEXT: v_mov_b32_e32 v0, s0
8070
+ ; GCN2-NEXT: v_mov_b32_e32 v1, s1
8071
+ ; GCN2-NEXT: flat_load_ushort v2, v[0:1] glc
8072
+ ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8073
+ ; GCN2-NEXT: buffer_wbinvl1_vol
8074
+ ; GCN2-NEXT: v_mov_b32_e32 v0, s2
8075
+ ; GCN2-NEXT: v_mov_b32_e32 v1, s3
8076
+ ; GCN2-NEXT: flat_store_short v[0:1], v2
8077
+ ; GCN2-NEXT: s_endpgm
8078
+ ;
8079
+ ; GCN3-LABEL: atomic_load_bf16_offset:
8080
+ ; GCN3: ; %bb.0:
8081
+ ; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
8082
+ ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
8083
+ ; GCN3-NEXT: v_mov_b32_e32 v0, s0
8084
+ ; GCN3-NEXT: v_mov_b32_e32 v1, s1
8085
+ ; GCN3-NEXT: flat_load_ushort v2, v[0:1] offset:16 glc
8086
+ ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8087
+ ; GCN3-NEXT: buffer_wbinvl1_vol
8088
+ ; GCN3-NEXT: v_mov_b32_e32 v0, s2
8089
+ ; GCN3-NEXT: v_mov_b32_e32 v1, s3
8090
+ ; GCN3-NEXT: flat_store_short v[0:1], v2
8091
+ ; GCN3-NEXT: s_endpgm
8092
+ %gep = getelementptr bfloat, ptr %in , i64 8
8093
+ %val = load atomic bfloat, ptr %gep seq_cst , align 2
8094
+ store bfloat %val , ptr %out
8095
+ ret void
8096
+ }
8097
+
8098
; Atomic seq_cst load of a bfloat directly from %in (align 2, no address
; offset), followed by a plain store of the loaded value to %out.
; All three check prefixes expect the same sequence: flat_load_ushort with
; glc, a wait on vmcnt(0) and lgkmcnt(0), buffer_wbinvl1_vol, then
; flat_store_short of the result. Only the s_load_dwordx4 immediate differs
; (0x9 for GCN1, 0x24 for GCN2 and GCN3).
; GCN1, GCN2 and GCN3 are FileCheck prefixes; the RUN lines that define them
; are outside this excerpt.
+ define amdgpu_kernel void @atomic_load_bf16 (ptr %in , ptr %out ) {
8099
+ ; GCN1-LABEL: atomic_load_bf16:
8100
+ ; GCN1: ; %bb.0:
8101
+ ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
8102
+ ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
8103
+ ; GCN1-NEXT: v_mov_b32_e32 v0, s0
8104
+ ; GCN1-NEXT: v_mov_b32_e32 v1, s1
8105
+ ; GCN1-NEXT: flat_load_ushort v2, v[0:1] glc
8106
+ ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8107
+ ; GCN1-NEXT: buffer_wbinvl1_vol
8108
+ ; GCN1-NEXT: v_mov_b32_e32 v0, s2
8109
+ ; GCN1-NEXT: v_mov_b32_e32 v1, s3
8110
+ ; GCN1-NEXT: flat_store_short v[0:1], v2
8111
+ ; GCN1-NEXT: s_endpgm
8112
+ ;
8113
+ ; GCN2-LABEL: atomic_load_bf16:
8114
+ ; GCN2: ; %bb.0:
8115
+ ; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
8116
+ ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
8117
+ ; GCN2-NEXT: v_mov_b32_e32 v0, s0
8118
+ ; GCN2-NEXT: v_mov_b32_e32 v1, s1
8119
+ ; GCN2-NEXT: flat_load_ushort v2, v[0:1] glc
8120
+ ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8121
+ ; GCN2-NEXT: buffer_wbinvl1_vol
8122
+ ; GCN2-NEXT: v_mov_b32_e32 v0, s2
8123
+ ; GCN2-NEXT: v_mov_b32_e32 v1, s3
8124
+ ; GCN2-NEXT: flat_store_short v[0:1], v2
8125
+ ; GCN2-NEXT: s_endpgm
8126
+ ;
8127
+ ; GCN3-LABEL: atomic_load_bf16:
8128
+ ; GCN3: ; %bb.0:
8129
+ ; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
8130
+ ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
8131
+ ; GCN3-NEXT: v_mov_b32_e32 v0, s0
8132
+ ; GCN3-NEXT: v_mov_b32_e32 v1, s1
8133
+ ; GCN3-NEXT: flat_load_ushort v2, v[0:1] glc
8134
+ ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8135
+ ; GCN3-NEXT: buffer_wbinvl1_vol
8136
+ ; GCN3-NEXT: v_mov_b32_e32 v0, s2
8137
+ ; GCN3-NEXT: v_mov_b32_e32 v1, s3
8138
+ ; GCN3-NEXT: flat_store_short v[0:1], v2
8139
+ ; GCN3-NEXT: s_endpgm
8140
+ %val = load atomic bfloat, ptr %in seq_cst , align 2
8141
+ store bfloat %val , ptr %out
8142
+ ret void
8143
+ }
0 commit comments