[GVN] Load-store forwarding of scalable store to fixed load. (#124748)
When storing a scalable vector and loading a fixed-size vector, where the scalable vector is known to be larger based on vscale_range, perform store-to-load forwarding through temporary @llvm.vector.extract calls. InstCombine then folds the insert/extract pair away.
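For example, with vscale_range(4,4) a `<vscale x 4 x float>` is known to hold exactly 16 floats, so a `<16 x float>` load from the stored-to address can be rewritten as follows (a minimal sketch with hypothetical %a/%p/%v names):

```llvm
; Before GVN: the fixed-size load reads back part of the scalable store.
store <vscale x 4 x float> %a, ptr %p
%v = load <16 x float>, ptr %p

; After GVN: the load is forwarded through a temporary extract of the
; stored value, so the memory access is no longer needed.
store <vscale x 4 x float> %a, ptr %p
%v = call <16 x float> @llvm.vector.extract.v16f32.nxv4f32(<vscale x 4 x float> %a, i64 0)
```

Where the fixed vector is immediately converted back to a scalable one via @llvm.vector.insert (as in the clang output below), InstCombine can then fold the resulting extract/insert pair back to plain %a.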
The use case appears in https://godbolt.org/z/KT3sMrMbd: clang generates IR that matches this pattern when the "arm_sve_vector_bits" attribute is used:
```c
#include <arm_sve.h>

typedef svfloat32_t svfloat32_fixed_t
    __attribute__((arm_sve_vector_bits(512)));

struct svfloat32_wrapped_t {
  svfloat32_fixed_t v;
};

static inline svfloat32_wrapped_t
add(svfloat32_wrapped_t a, svfloat32_wrapped_t b) {
  return {svadd_f32_x(svptrue_b32(), a.v, b.v)};
}

svfloat32_wrapped_t
foo(svfloat32_wrapped_t a, svfloat32_wrapped_t b) {
  // The IR pattern this patch matches is generated for this return:
  return add(a, b);
}
```
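Compiled for SVE with 512-bit vectors, the return in `foo` lowers to IR of roughly the following shape (a hand-reduced sketch of the godbolt output, with the svadd_f32_x already simplified to an fadd):

```llvm
define <vscale x 4 x float> @foo(<vscale x 4 x float> %a, <vscale x 4 x float> %b) vscale_range(4,4) {
entry:
  %retval = alloca { <16 x float> }
  %add = fadd <vscale x 4 x float> %a, %b
  ; The fixed-size struct member forces the scalable result through memory...
  store <vscale x 4 x float> %add, ptr %retval
  %0 = load <16 x float>, ptr %retval
  ; ...and back to a scalable vector for the ABI-level return.
  %cast.scalable = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v16f32(<vscale x 4 x float> poison, <16 x float> %0, i64 0)
  ret <vscale x 4 x float> %cast.scalable
}
```

This is the store-scalable/load-fixed pattern the patch forwards. The tests added by the patch (excerpted below) cover it along with unknown-vscale, size-mismatch, and differing-element-type variants.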
```llvm
; CHECK-NEXT:    [[CAST_SCALABLE:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v16f32(<vscale x 4 x float> poison, <16 x float> [[TMP0]], i64 0)
; CHECK-NEXT:    ret <vscale x 4 x float> [[CAST_SCALABLE]]
;
entry:
  %ptr = alloca { <32 x float> }
  store <vscale x 4 x float> %a, ptr %ptr
  %gep = getelementptr inbounds i8, ptr %ptr, i64 8
  %1 = load <16 x float>, ptr %gep
  %cast.scalable = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v16f32(<vscale x 4 x float> poison, <16 x float> %1, i64 0)
  ret <vscale x 4 x float> %cast.scalable
}

define <vscale x 4 x float> @scalable_store_to_fixed_load_unknown_vscale(<vscale x 4 x float> %.coerce) {
; CHECK-NEXT:    [[CAST_SCALABLE:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v16f32(<vscale x 4 x float> poison, <16 x float> [[TMP1]], i64 0)
; CHECK-NEXT:    ret <vscale x 4 x float> [[CAST_SCALABLE]]
;
entry:
  %retval = alloca { <16 x float> }
  %0 = fadd <vscale x 4 x float> %.coerce, %.coerce
  store <vscale x 4 x float> %0, ptr %retval
  %1 = load <16 x float>, ptr %retval
  %cast.scalable = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v16f32(<vscale x 4 x float> poison, <16 x float> %1, i64 0)
  ret <vscale x 4 x float> %cast.scalable
}

define <vscale x 4 x float> @scalable_store_to_fixed_load_size_missmatch(<vscale x 4 x float> %.coerce) vscale_range(4,4) {
; CHECK-NEXT:    [[CAST_SCALABLE:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v32f32(<vscale x 4 x float> poison, <32 x float> [[TMP1]], i64 0)
; CHECK-NEXT:    ret <vscale x 4 x float> [[CAST_SCALABLE]]
;
entry:
  %retval = alloca { <32 x float> }
  %0 = fadd <vscale x 4 x float> %.coerce, %.coerce
  store <vscale x 4 x float> %0, ptr %retval
  %1 = load <32 x float>, ptr %retval
  %cast.scalable = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v32f32(<vscale x 4 x float> poison, <32 x float> %1, i64 0)
  ret <vscale x 4 x float> %cast.scalable
}

define <vscale x 4 x i32> @scalable_store_to_fixed_load_different_types(<vscale x 4 x float> %a) vscale_range(4,4) {
; CHECK-NEXT:    [[CAST_SCALABLE:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v16i32(<vscale x 4 x i32> poison, <16 x i32> [[TMP0]], i64 0)
; CHECK-NEXT:    ret <vscale x 4 x i32> [[CAST_SCALABLE]]
;
entry:
  %ptr = alloca { <16 x float> }
  store <vscale x 4 x float> %a, ptr %ptr
  %1 = load <16 x i32>, ptr %ptr
  %cast.scalable = tail call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v16i32(<vscale x 4 x i32> poison, <16 x i32> %1, i64 0)
  ret <vscale x 4 x i32> %cast.scalable
}

; This function does not have a fixed vscale, but the loaded vector is still known
; to be smaller or equal in size compared to the stored vector.
define <4 x float> @scalable_store_to_small_fixed_load(<vscale x 4 x float> %a) {
```
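Even without vscale_range, the LangRef guarantees vscale >= 1, so the stored <vscale x 4 x float> occupies at least 16 bytes and always covers the <4 x float> load. Presumably the forwarding then applies here as well, producing something like (hypothetical %v name):

```llvm
%v = call <4 x float> @llvm.vector.extract.v4f32.nxv4f32(<vscale x 4 x float> %a, i64 0)
```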