Commit cf17a24

[RISCV] Use subreg extract for extract_vector_elt when vlen is known (#72666)
This is the first in a planned patch series to teach our vector lowering how to exploit register boundaries in LMUL>1 types when VLEN is known to be an exact constant. This corresponds to code compiled by clang with the -mrvv-vector-bits=zvl option.

For extract_vector_elt, if we have a constant index and a known VLEN, we can identify which register out of a register group is being accessed. Given this, we can do a sub-register extract for that register and then shift any remaining index. As a result, all constant-index extracts become m1 operations, which eliminates the complexity concern for explode-vector idioms at high LMUL.
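A rough sketch of the index decomposition described above, separate from the patch itself (the struct and helper names here are purely illustrative; the actual lowering code is in the RISCVISelLowering.cpp diff below):

// Illustrative only: given an exact VLEN, the element width in bits, and a
// constant element index, compute which m1 register of the LMUL>1 register
// group holds the element and the element's index within that register.
struct SubRegExtract {
  unsigned SubRegIdx; // which register of the group (0-based)
  unsigned RemIdx;    // element index within that register
};

static SubRegExtract decomposeIndex(unsigned VLen, unsigned ElemSizeInBits,
                                    unsigned OrigIdx) {
  unsigned ElemsPerVReg = VLen / ElemSizeInBits; // elements held by one vector register
  return {OrigIdx / ElemsPerVReg, OrigIdx % ElemsPerVReg};
}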
1 parent 1944c4f commit cf17a24

4 files changed (+207, -4 lines)


llvm/lib/Target/RISCV/RISCVISelLowering.cpp

Lines changed: 24 additions & 0 deletions
@@ -7908,6 +7908,30 @@ SDValue RISCVTargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
     Vec = convertToScalableVector(ContainerVT, Vec, DAG, Subtarget);
   }
 
+  // If we're compiling for an exact VLEN value and we have a known
+  // constant index, we can always perform the extract in m1 (or
+  // smaller) as we can determine the register corresponding to
+  // the index in the register group.
+  const unsigned MinVLen = Subtarget.getRealMinVLen();
+  const unsigned MaxVLen = Subtarget.getRealMaxVLen();
+  if (auto *IdxC = dyn_cast<ConstantSDNode>(Idx);
+      IdxC && MinVLen == MaxVLen &&
+      VecVT.getSizeInBits().getKnownMinValue() > MinVLen) {
+    MVT M1VT = getLMUL1VT(ContainerVT);
+    unsigned OrigIdx = IdxC->getZExtValue();
+    EVT ElemVT = VecVT.getVectorElementType();
+    unsigned ElemSize = ElemVT.getSizeInBits().getKnownMinValue();
+    unsigned ElemsPerVReg = MinVLen / ElemSize;
+    unsigned RemIdx = OrigIdx % ElemsPerVReg;
+    unsigned SubRegIdx = OrigIdx / ElemsPerVReg;
+    unsigned ExtractIdx =
+        SubRegIdx * M1VT.getVectorElementCount().getKnownMinValue();
+    Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, M1VT, Vec,
+                      DAG.getVectorIdxConstant(ExtractIdx, DL));
+    Idx = DAG.getVectorIdxConstant(RemIdx, DL);
+    ContainerVT = M1VT;
+  }
+
   // Reduce the LMUL of our slidedown and vmv.x.s to the smallest LMUL which
   // contains our index.
   std::optional<uint64_t> MaxIdx;
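To make the arithmetic concrete, here are the values for the nxv8i64 index-15 case exercised by the tests below, assuming an exact VLEN of 128 (the vscale_range(2,2) attribute used in those tests); this is illustration only, not part of the patch:

// Worked numbers, assuming VLEN = 128 and i64 elements.
constexpr unsigned MinVLen = 128;                      // exact VLEN
constexpr unsigned ElemSize = 64;                      // i64 element width in bits
constexpr unsigned ElemsPerVReg = MinVLen / ElemSize;  // 2 elements per vector register
constexpr unsigned OrigIdx = 15;                       // constant extract index
constexpr unsigned SubRegIdx = OrigIdx / ElemsPerVReg; // 7: eighth register of the m8 group (v8 + 7 = v15)
constexpr unsigned RemIdx = OrigIdx % ElemsPerVReg;    // 1: slide down by 1 at m1, then vmv.x.s
static_assert(SubRegIdx == 7 && RemIdx == 1,
              "matches the v15 / slide-by-1 CHECK lines in the test below");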

llvm/test/CodeGen/RISCV/rvv/extractelt-int-rv64.ll

Lines changed: 25 additions & 4 deletions
@@ -697,6 +697,27 @@ define i64 @extractelt_nxv8i64_imm(<vscale x 8 x i64> %v) {
   ret i64 %r
 }
 
+define i64 @extractelt_nxv8i64_2_exact_vlen(<vscale x 8 x i64> %v) vscale_range(2,2) {
+; CHECK-LABEL: extractelt_nxv8i64_2_exact_vlen:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
+; CHECK-NEXT:    vmv.x.s a0, v9
+; CHECK-NEXT:    ret
+  %r = extractelement <vscale x 8 x i64> %v, i32 2
+  ret i64 %r
+}
+
+define i64 @extractelt_nxv8i64_15_exact_vlen(<vscale x 8 x i64> %v) vscale_range(2,2) {
+; CHECK-LABEL: extractelt_nxv8i64_15_exact_vlen:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vi v8, v15, 1
+; CHECK-NEXT:    vmv.x.s a0, v8
+; CHECK-NEXT:    ret
+  %r = extractelement <vscale x 8 x i64> %v, i32 15
+  ret i64 %r
+}
+
 define i64 @extractelt_nxv8i64_idx(<vscale x 8 x i64> %v, i32 zeroext %idx) {
 ; CHECK-LABEL: extractelt_nxv8i64_idx:
 ; CHECK:       # %bb.0:
@@ -860,10 +881,10 @@ define i64 @extractelt_nxv16i64_neg1(<vscale x 16 x i64> %v) {
 ; CHECK-NEXT:    slli a2, a2, 1
 ; CHECK-NEXT:    addi a2, a2, -1
 ; CHECK-NEXT:    vs8r.v v16, (a3)
-; CHECK-NEXT:    bltu a2, a1, .LBB72_2
+; CHECK-NEXT:    bltu a2, a1, .LBB74_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a2, a1
-; CHECK-NEXT:  .LBB72_2:
+; CHECK-NEXT:  .LBB74_2:
 ; CHECK-NEXT:    slli a2, a2, 3
 ; CHECK-NEXT:    add a0, a0, a2
 ; CHECK-NEXT:    ld a0, 0(a0)
@@ -893,10 +914,10 @@ define i64 @extractelt_nxv16i64_idx(<vscale x 16 x i64> %v, i32 zeroext %idx) {
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a2, a1, 1
 ; CHECK-NEXT:    addi a2, a2, -1
-; CHECK-NEXT:    bltu a0, a2, .LBB74_2
+; CHECK-NEXT:    bltu a0, a2, .LBB76_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a0, a2
-; CHECK-NEXT:  .LBB74_2:
+; CHECK-NEXT:  .LBB76_2:
 ; CHECK-NEXT:    addi sp, sp, -80
 ; CHECK-NEXT:    .cfi_def_cfa_offset 80
 ; CHECK-NEXT:    sd ra, 72(sp) # 8-byte Folded Spill

llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract.ll

Lines changed: 28 additions & 0 deletions
@@ -1137,3 +1137,31 @@ define float @extractelt_fdiv_v4f32(<4 x float> %x) {
   %ext = extractelement <4 x float> %bo, i32 2
   ret float %ext
 }
+
+define i32 @extractelt_v16i32_idx7_exact_vlen(ptr %x) nounwind vscale_range(2,2) {
+; CHECK-LABEL: extractelt_v16i32_idx7_exact_vlen:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vi v8, v9, 3
+; CHECK-NEXT:    vmv.x.s a0, v8
+; CHECK-NEXT:    ret
+  %a = load <16 x i32>, ptr %x
+  %b = extractelement <16 x i32> %a, i32 7
+  ret i32 %b
+}
+
+define i32 @extractelt_v16i32_idx15_exact_vlen(ptr %x) nounwind vscale_range(2,2) {
+; CHECK-LABEL: extractelt_v16i32_idx15_exact_vlen:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vi v8, v11, 3
+; CHECK-NEXT:    vmv.x.s a0, v8
+; CHECK-NEXT:    ret
+  %a = load <16 x i32>, ptr %x
+  %b = extractelement <16 x i32> %a, i32 15
+  ret i32 %b
+}

llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-explodevector.ll

Lines changed: 130 additions & 0 deletions
@@ -1084,3 +1084,133 @@ define i64 @explode_16xi64(<16 x i64> %v) {
   %add14 = add i64 %add13, %e15
   ret i64 %add14
 }
+
+define i32 @explode_16xi32_exact_vlen(<16 x i32> %v) vscale_range(2, 2) {
+; RV32-LABEL: explode_16xi32_exact_vlen:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; RV32-NEXT:    vslidedown.vi v12, v8, 2
+; RV32-NEXT:    vmv.x.s a0, v12
+; RV32-NEXT:    vslidedown.vi v12, v8, 3
+; RV32-NEXT:    vmv.x.s a1, v12
+; RV32-NEXT:    vmv.x.s a2, v9
+; RV32-NEXT:    vslidedown.vi v12, v9, 1
+; RV32-NEXT:    vmv.x.s a3, v12
+; RV32-NEXT:    vslidedown.vi v12, v9, 2
+; RV32-NEXT:    vmv.x.s a4, v12
+; RV32-NEXT:    vslidedown.vi v9, v9, 3
+; RV32-NEXT:    vmv.x.s a5, v9
+; RV32-NEXT:    vmv.x.s a6, v10
+; RV32-NEXT:    vslidedown.vi v9, v10, 1
+; RV32-NEXT:    vmv.x.s a7, v9
+; RV32-NEXT:    vslidedown.vi v9, v10, 2
+; RV32-NEXT:    vmv.x.s t0, v9
+; RV32-NEXT:    vslidedown.vi v9, v10, 3
+; RV32-NEXT:    vmv.x.s t1, v9
+; RV32-NEXT:    vmv.x.s t2, v11
+; RV32-NEXT:    vslidedown.vi v9, v11, 1
+; RV32-NEXT:    vmv.x.s t3, v9
+; RV32-NEXT:    vslidedown.vi v9, v11, 2
+; RV32-NEXT:    vmv.x.s t4, v9
+; RV32-NEXT:    vslidedown.vi v9, v11, 3
+; RV32-NEXT:    vmv.x.s t5, v9
+; RV32-NEXT:    vmv.s.x v9, zero
+; RV32-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; RV32-NEXT:    vredxor.vs v8, v8, v9
+; RV32-NEXT:    vmv.x.s t6, v8
+; RV32-NEXT:    add a0, a0, a1
+; RV32-NEXT:    add a0, t6, a0
+; RV32-NEXT:    add a2, a2, a3
+; RV32-NEXT:    add a2, a2, a4
+; RV32-NEXT:    add a0, a0, a2
+; RV32-NEXT:    add a5, a5, a6
+; RV32-NEXT:    add a5, a5, a7
+; RV32-NEXT:    add a5, a5, t0
+; RV32-NEXT:    add a0, a0, a5
+; RV32-NEXT:    add t1, t1, t2
+; RV32-NEXT:    add t1, t1, t3
+; RV32-NEXT:    add t1, t1, t4
+; RV32-NEXT:    add t1, t1, t5
+; RV32-NEXT:    add a0, a0, t1
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: explode_16xi32_exact_vlen:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; RV64-NEXT:    vslidedown.vi v12, v8, 2
+; RV64-NEXT:    vmv.x.s a0, v12
+; RV64-NEXT:    vslidedown.vi v12, v8, 3
+; RV64-NEXT:    vmv.x.s a1, v12
+; RV64-NEXT:    vmv.x.s a2, v9
+; RV64-NEXT:    vslidedown.vi v12, v9, 1
+; RV64-NEXT:    vmv.x.s a3, v12
+; RV64-NEXT:    vslidedown.vi v12, v9, 2
+; RV64-NEXT:    vmv.x.s a4, v12
+; RV64-NEXT:    vslidedown.vi v9, v9, 3
+; RV64-NEXT:    vmv.x.s a5, v9
+; RV64-NEXT:    vmv.x.s a6, v10
+; RV64-NEXT:    vslidedown.vi v9, v10, 1
+; RV64-NEXT:    vmv.x.s a7, v9
+; RV64-NEXT:    vslidedown.vi v9, v10, 2
+; RV64-NEXT:    vmv.x.s t0, v9
+; RV64-NEXT:    vslidedown.vi v9, v10, 3
+; RV64-NEXT:    vmv.x.s t1, v9
+; RV64-NEXT:    vmv.x.s t2, v11
+; RV64-NEXT:    vslidedown.vi v9, v11, 1
+; RV64-NEXT:    vmv.x.s t3, v9
+; RV64-NEXT:    vslidedown.vi v9, v11, 2
+; RV64-NEXT:    vmv.x.s t4, v9
+; RV64-NEXT:    vslidedown.vi v9, v11, 3
+; RV64-NEXT:    vmv.x.s t5, v9
+; RV64-NEXT:    vmv.s.x v9, zero
+; RV64-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; RV64-NEXT:    vredxor.vs v8, v8, v9
+; RV64-NEXT:    vmv.x.s t6, v8
+; RV64-NEXT:    add a0, a0, a1
+; RV64-NEXT:    add a0, t6, a0
+; RV64-NEXT:    add a2, a2, a3
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    add a0, a0, a2
+; RV64-NEXT:    add a5, a5, a6
+; RV64-NEXT:    add a5, a5, a7
+; RV64-NEXT:    add a5, a5, t0
+; RV64-NEXT:    add a0, a0, a5
+; RV64-NEXT:    add t1, t1, t2
+; RV64-NEXT:    add t1, t1, t3
+; RV64-NEXT:    add t1, t1, t4
+; RV64-NEXT:    add t1, t1, t5
+; RV64-NEXT:    addw a0, a0, t1
+; RV64-NEXT:    ret
+  %e0 = extractelement <16 x i32> %v, i32 0
+  %e1 = extractelement <16 x i32> %v, i32 1
+  %e2 = extractelement <16 x i32> %v, i32 2
+  %e3 = extractelement <16 x i32> %v, i32 3
+  %e4 = extractelement <16 x i32> %v, i32 4
+  %e5 = extractelement <16 x i32> %v, i32 5
+  %e6 = extractelement <16 x i32> %v, i32 6
+  %e7 = extractelement <16 x i32> %v, i32 7
+  %e8 = extractelement <16 x i32> %v, i32 8
+  %e9 = extractelement <16 x i32> %v, i32 9
+  %e10 = extractelement <16 x i32> %v, i32 10
+  %e11 = extractelement <16 x i32> %v, i32 11
+  %e12 = extractelement <16 x i32> %v, i32 12
+  %e13 = extractelement <16 x i32> %v, i32 13
+  %e14 = extractelement <16 x i32> %v, i32 14
+  %e15 = extractelement <16 x i32> %v, i32 15
+  %add0 = xor i32 %e0, %e1
+  %add1 = add i32 %add0, %e2
+  %add2 = add i32 %add1, %e3
+  %add3 = add i32 %add2, %e4
+  %add4 = add i32 %add3, %e5
+  %add5 = add i32 %add4, %e6
+  %add6 = add i32 %add5, %e7
+  %add7 = add i32 %add6, %e8
+  %add8 = add i32 %add7, %e9
+  %add9 = add i32 %add8, %e10
+  %add10 = add i32 %add9, %e11
+  %add11 = add i32 %add10, %e12
+  %add12 = add i32 %add11, %e13
+  %add13 = add i32 %add12, %e14
+  %add14 = add i32 %add13, %e15
+  ret i32 %add14
+}
