Skip to content

Commit 5bf3f08

Browse files
authored
[RISCV] Update some of the RVV memory ops in SiFive P400 & P600 sched models (llvm#129575)
This patch updates the latencies as well as occupancies of unit stride, strided, and indexed load/store instructions in SiFive P400 & P600 scheduling models.
1 parent 5f86666 commit 5bf3f08

File tree

8 files changed

+3007
-118
lines changed

8 files changed

+3007
-118
lines changed

llvm/lib/Target/RISCV/RISCVSchedSiFiveP400.td

Lines changed: 53 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,8 @@ class SiFiveP400IsWorstCaseMXSEW<string mx, int sew, list<string> MxList, bit is
2222
bit c = !and(!eq(mx, LLMUL), !eq(sew, SSEW));
2323
}
2424

25+
defvar SiFiveP400VLEN = 128;
26+
2527
// 1 Micro-Op per cycle.
2628
class SiFiveP400GetLMulCycles<string mx> {
2729
int c = !cond(
@@ -35,19 +37,31 @@ class SiFiveP400GetLMulCycles<string mx> {
3537
);
3638
}
3739

38-
// Latency for segmented loads and stores is calculated as vl * nf.
39-
class SiFiveP400GetCyclesSegmented<string mx, int sew, int nf> {
40-
defvar VLEN = 128;
41-
defvar VLUpperBound = !cond(
42-
!eq(mx, "M1") : !div(VLEN, sew),
43-
!eq(mx, "M2") : !div(!mul(VLEN, 2), sew),
44-
!eq(mx, "M4") : !div(!mul(VLEN, 4), sew),
45-
!eq(mx, "M8") : !div(!mul(VLEN, 8), sew),
46-
!eq(mx, "MF2") : !div(!div(VLEN, 2), sew),
47-
!eq(mx, "MF4") : !div(!div(VLEN, 4), sew),
48-
!eq(mx, "MF8") : !div(!div(VLEN, 8), sew),
40+
class SiFiveP400GetVLMAX<string mx, int sew> {
41+
defvar LMUL = SiFiveP400GetLMulCycles<mx>.c;
42+
int val = !cond(
43+
!eq(mx, "MF2") : !div(!div(SiFiveP400VLEN, 2), sew),
44+
!eq(mx, "MF4") : !div(!div(SiFiveP400VLEN, 4), sew),
45+
!eq(mx, "MF8") : !div(!div(SiFiveP400VLEN, 8), sew),
46+
true: !div(!mul(SiFiveP400VLEN, LMUL), sew)
47+
);
48+
}
49+
50+
class SiFiveP400StridedLdStLatency<string mx, int sew> {
51+
defvar VL = SiFiveP400GetVLMAX<mx, sew>.val;
52+
int val = !cond(
53+
!eq(VL, 2): 13,
54+
!eq(VL, 4): 18,
55+
!eq(VL, 8): 22,
56+
!eq(VL, 16): 30,
57+
// VL=32,64,128
58+
true: !sub(VL, 2)
4959
);
50-
int c = !mul(VLUpperBound, nf);
60+
}
61+
62+
// Latency for segmented loads and stores is calculated as vl * nf.
63+
class SiFiveP400SegmentedLdStCycles<string mx, int sew, int nf> {
64+
int c = !mul(SiFiveP400GetVLMAX<mx, sew>.val, nf);
5165
}
5266

5367
// Both variants of floating point vector reductions are based on numbers collected
@@ -368,65 +382,45 @@ def : WriteRes<WriteVSETIVLI, [SiFiveP400SYS]>;
368382
def : WriteRes<WriteVSETVL, [SiFiveP400SYS]>;
369383

370384
// 7. Vector Loads and Stores
371-
// FIXME: This unit is still being improved, currently
372-
// it is based on stage numbers. Estimates are optimistic,
373-
// latency may be longer.
374-
foreach mx = SchedMxList in {
375-
defvar LMulLat = SiFiveP400GetLMulCycles<mx>.c;
376-
defvar IsWorstCase = SiFiveP400IsWorstCaseMX<mx, SchedMxList>.c;
377-
let Latency = 8, ReleaseAtCycles = [LMulLat] in {
378-
defm "" : LMULWriteResMX<"WriteVLDE", [SiFiveP400VLD], mx, IsWorstCase>;
379-
defm "" : LMULWriteResMX<"WriteVLDM", [SiFiveP400VLD], mx, IsWorstCase>;
380-
defm "" : LMULWriteResMX<"WriteVLDFF", [SiFiveP400VLD], mx, IsWorstCase>;
381-
}
382-
let Latency = 12, ReleaseAtCycles = [LMulLat] in {
383-
defm "" : LMULWriteResMX<"WriteVLDS8", [SiFiveP400VLD], mx, IsWorstCase>;
384-
defm "" : LMULWriteResMX<"WriteVLDS16", [SiFiveP400VLD], mx, IsWorstCase>;
385-
defm "" : LMULWriteResMX<"WriteVLDS32", [SiFiveP400VLD], mx, IsWorstCase>;
386-
defm "" : LMULWriteResMX<"WriteVLDS64", [SiFiveP400VLD], mx, IsWorstCase>;
387-
}
388-
let Latency = 12, ReleaseAtCycles = [LMulLat] in {
389-
defm "" : LMULWriteResMX<"WriteVLDUX8", [SiFiveP400VLD], mx, IsWorstCase>;
390-
defm "" : LMULWriteResMX<"WriteVLDUX16", [SiFiveP400VLD], mx, IsWorstCase>;
391-
defm "" : LMULWriteResMX<"WriteVLDUX32", [SiFiveP400VLD], mx, IsWorstCase>;
392-
defm "" : LMULWriteResMX<"WriteVLDUX64", [SiFiveP400VLD], mx, IsWorstCase>;
393-
defm "" : LMULWriteResMX<"WriteVLDOX8", [SiFiveP400VLD], mx, IsWorstCase>;
394-
defm "" : LMULWriteResMX<"WriteVLDOX16", [SiFiveP400VLD], mx, IsWorstCase>;
395-
defm "" : LMULWriteResMX<"WriteVLDOX32", [SiFiveP400VLD], mx, IsWorstCase>;
396-
defm "" : LMULWriteResMX<"WriteVLDOX64", [SiFiveP400VLD], mx, IsWorstCase>;
397-
}
398-
}
399385

386+
// Note that the latency of vector loads is measured by consuming the loaded
387+
// value with vmv.x.s before subtracting the latency of vmv.x.s from the number.
400388
foreach mx = SchedMxList in {
401389
defvar LMulLat = SiFiveP400GetLMulCycles<mx>.c;
402390
defvar IsWorstCase = SiFiveP400IsWorstCaseMX<mx, SchedMxList>.c;
403-
let Latency = 8, ReleaseAtCycles = [LMulLat] in {
404-
defm "" : LMULWriteResMX<"WriteVSTE", [SiFiveP400VST], mx, IsWorstCase>;
405-
defm "" : LMULWriteResMX<"WriteVSTM", [SiFiveP400VST], mx, IsWorstCase>;
406-
}
407-
let Latency = 12, ReleaseAtCycles = [LMulLat] in {
408-
defm "" : LMULWriteResMX<"WriteVSTS8", [SiFiveP400VST], mx, IsWorstCase>;
409-
defm "" : LMULWriteResMX<"WriteVSTS16", [SiFiveP400VST], mx, IsWorstCase>;
410-
defm "" : LMULWriteResMX<"WriteVSTS32", [SiFiveP400VST], mx, IsWorstCase>;
411-
defm "" : LMULWriteResMX<"WriteVSTS64", [SiFiveP400VST], mx, IsWorstCase>;
391+
let Latency = 8 in {
392+
let ReleaseAtCycles = [LMulLat] in {
393+
defm "" : LMULWriteResMX<"WriteVLDE", [SiFiveP400VLD], mx, IsWorstCase>;
394+
defm "" : LMULWriteResMX<"WriteVLDFF", [SiFiveP400VLD], mx, IsWorstCase>;
395+
396+
defm "" : LMULWriteResMX<"WriteVSTE", [SiFiveP400VST], mx, IsWorstCase>;
397+
}
398+
399+
// Mask load and store have a maximum EMUL of 1.
400+
let ReleaseAtCycles = [SiFiveP400GetLMulCycles<"M1">.c] in {
401+
defm "" : LMULWriteResMX<"WriteVLDM", [SiFiveP400VLD], mx, IsWorstCase=!eq(mx, "M1")>;
402+
defm "" : LMULWriteResMX<"WriteVSTM", [SiFiveP400VST], mx, IsWorstCase=!eq(mx, "M1")>;
403+
}
412404
}
413-
let Latency = 12, ReleaseAtCycles = [LMulLat] in {
414-
defm "" : LMULWriteResMX<"WriteVSTUX8", [SiFiveP400VST], mx, IsWorstCase>;
415-
defm "" : LMULWriteResMX<"WriteVSTUX16", [SiFiveP400VST], mx, IsWorstCase>;
416-
defm "" : LMULWriteResMX<"WriteVSTUX32", [SiFiveP400VST], mx, IsWorstCase>;
417-
defm "" : LMULWriteResMX<"WriteVSTUX64", [SiFiveP400VST], mx, IsWorstCase>;
418-
defm "" : LMULWriteResMX<"WriteVSTOX8", [SiFiveP400VST], mx, IsWorstCase>;
419-
defm "" : LMULWriteResMX<"WriteVSTOX16", [SiFiveP400VST], mx, IsWorstCase>;
420-
defm "" : LMULWriteResMX<"WriteVSTOX32", [SiFiveP400VST], mx, IsWorstCase>;
421-
defm "" : LMULWriteResMX<"WriteVSTOX64", [SiFiveP400VST], mx, IsWorstCase>;
405+
foreach eew = [8, 16, 32, 64] in {
406+
let Latency = SiFiveP400StridedLdStLatency<mx, eew>.val,
407+
ReleaseAtCycles = [SiFiveP400GetVLMAX<mx, eew>.val] in {
408+
defm "" : LMULWriteResMX<"WriteVLDS" # eew, [SiFiveP400VLD], mx, IsWorstCase>;
409+
defm "" : LMULWriteResMX<"WriteVLDUX" # eew, [SiFiveP400VLD], mx, IsWorstCase>;
410+
defm "" : LMULWriteResMX<"WriteVLDOX" # eew, [SiFiveP400VLD], mx, IsWorstCase>;
411+
412+
defm "" : LMULWriteResMX<"WriteVSTS" # eew, [SiFiveP400VST], mx, IsWorstCase>;
413+
defm "" : LMULWriteResMX<"WriteVSTUX" # eew, [SiFiveP400VST], mx, IsWorstCase>;
414+
defm "" : LMULWriteResMX<"WriteVSTOX" # eew, [SiFiveP400VST], mx, IsWorstCase>;
415+
}
422416
}
423417
}
424418

425419
foreach mx = SchedMxList in {
426420
foreach nf=2-8 in {
427421
foreach eew = [8, 16, 32, 64] in {
428422
defvar IsWorstCase = SiFiveP400IsWorstCaseMX<mx, SchedMxList>.c;
429-
defvar LMulLat = SiFiveP400GetCyclesSegmented<mx, eew, nf>.c;
423+
defvar LMulLat = SiFiveP400SegmentedLdStCycles<mx, eew, nf>.c;
430424
let Latency = !add(12, LMulLat), ReleaseAtCycles = [!add(12, LMulLat)] in {
431425
defm "" : LMULWriteResMX<"WriteVLSEG" # nf # "e" #eew, [SiFiveP400VLD], mx, IsWorstCase>;
432426
defm "" : LMULWriteResMX<"WriteVLSEGFF" # nf # "e" #eew, [SiFiveP400VLD], mx, IsWorstCase>;

llvm/lib/Target/RISCV/RISCVSchedSiFiveP600.td

Lines changed: 53 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,8 @@ class SiFiveP600IsWorstCaseMXSEW<string mx, int sew, list<string> MxList, bit is
2222
bit c = !and(!eq(mx, LLMUL), !eq(sew, SSEW));
2323
}
2424

25+
defvar SiFiveP600VLEN = 128;
26+
2527
// 1 Micro-Op per cycle.
2628
class SiFiveP600GetLMulCycles<string mx> {
2729
int c = !cond(
@@ -35,19 +37,31 @@ class SiFiveP600GetLMulCycles<string mx> {
3537
);
3638
}
3739

38-
// Latency for segmented loads and stores is calculated as vl * nf.
39-
class SiFiveP600GetCyclesSegmented<string mx, int sew, int nf> {
40-
defvar VLEN = 128;
41-
defvar VLUpperBound = !cond(
42-
!eq(mx, "M1") : !div(VLEN, sew),
43-
!eq(mx, "M2") : !div(!mul(VLEN, 2), sew),
44-
!eq(mx, "M4") : !div(!mul(VLEN, 4), sew),
45-
!eq(mx, "M8") : !div(!mul(VLEN, 8), sew),
46-
!eq(mx, "MF2") : !div(!div(VLEN, 2), sew),
47-
!eq(mx, "MF4") : !div(!div(VLEN, 4), sew),
48-
!eq(mx, "MF8") : !div(!div(VLEN, 8), sew),
40+
class SiFiveP600GetVLMAX<string mx, int sew> {
41+
defvar LMUL = SiFiveP600GetLMulCycles<mx>.c;
42+
int val = !cond(
43+
!eq(mx, "MF2") : !div(!div(SiFiveP600VLEN, 2), sew),
44+
!eq(mx, "MF4") : !div(!div(SiFiveP600VLEN, 4), sew),
45+
!eq(mx, "MF8") : !div(!div(SiFiveP600VLEN, 8), sew),
46+
true: !div(!mul(SiFiveP600VLEN, LMUL), sew)
47+
);
48+
}
49+
50+
class SiFiveP600StridedLdStLatency<string mx, int sew> {
51+
defvar VL = SiFiveP600GetVLMAX<mx, sew>.val;
52+
int val = !cond(
53+
!eq(VL, 2): 13,
54+
!eq(VL, 4): 18,
55+
!eq(VL, 8): 22,
56+
!eq(VL, 16): 30,
57+
// VL=32,64,128
58+
true: !sub(VL, 2)
4959
);
50-
int c = !mul(VLUpperBound, nf);
60+
}
61+
62+
// Latency for segmented loads and stores is calculated as vl * nf.
63+
class SiFiveP600SegmentedLdStCycles<string mx, int sew, int nf> {
64+
int c = !mul(SiFiveP600GetVLMAX<mx, sew>.val, nf);
5165
}
5266

5367
class SiFiveP600VSM3CCycles<string mx> {
@@ -544,64 +558,44 @@ def : WriteRes<WriteVSETIVLI, [SiFiveP600SYS]>;
544558
def : WriteRes<WriteVSETVL, [SiFiveP600SYS]>;
545559

546560
// 7. Vector Loads and Stores
547-
// FIXME: This unit is still being improved, currently
548-
// it is based on stage numbers. Estimates are optimistic,
549-
// latency may be longer.
550-
foreach mx = SchedMxList in {
551-
defvar LMulLat = SiFiveP600GetLMulCycles<mx>.c;
552-
defvar IsWorstCase = SiFiveP600IsWorstCaseMX<mx, SchedMxList>.c;
553-
let Latency = 8, ReleaseAtCycles = [LMulLat] in {
554-
defm "" : LMULWriteResMX<"WriteVLDE", [SiFiveP600VLD], mx, IsWorstCase>;
555-
defm "" : LMULWriteResMX<"WriteVLDM", [SiFiveP600VLD], mx, IsWorstCase>;
556-
defm "" : LMULWriteResMX<"WriteVLDFF", [SiFiveP600VLD], mx, IsWorstCase>;
557-
}
558-
let Latency = 12, ReleaseAtCycles = [LMulLat] in {
559-
defm "" : LMULWriteResMX<"WriteVLDS8", [SiFiveP600VLD], mx, IsWorstCase>;
560-
defm "" : LMULWriteResMX<"WriteVLDS16", [SiFiveP600VLD], mx, IsWorstCase>;
561-
defm "" : LMULWriteResMX<"WriteVLDS32", [SiFiveP600VLD], mx, IsWorstCase>;
562-
defm "" : LMULWriteResMX<"WriteVLDS64", [SiFiveP600VLD], mx, IsWorstCase>;
563-
}
564-
let Latency = 12, ReleaseAtCycles = [LMulLat] in {
565-
defm "" : LMULWriteResMX<"WriteVLDUX8", [SiFiveP600VLD], mx, IsWorstCase>;
566-
defm "" : LMULWriteResMX<"WriteVLDUX16", [SiFiveP600VLD], mx, IsWorstCase>;
567-
defm "" : LMULWriteResMX<"WriteVLDUX32", [SiFiveP600VLD], mx, IsWorstCase>;
568-
defm "" : LMULWriteResMX<"WriteVLDUX64", [SiFiveP600VLD], mx, IsWorstCase>;
569-
defm "" : LMULWriteResMX<"WriteVLDOX8", [SiFiveP600VLD], mx, IsWorstCase>;
570-
defm "" : LMULWriteResMX<"WriteVLDOX16", [SiFiveP600VLD], mx, IsWorstCase>;
571-
defm "" : LMULWriteResMX<"WriteVLDOX32", [SiFiveP600VLD], mx, IsWorstCase>;
572-
defm "" : LMULWriteResMX<"WriteVLDOX64", [SiFiveP600VLD], mx, IsWorstCase>;
573-
}
574-
}
575561

562+
// Note that the latency of vector loads is measured by consuming the loaded
563+
// value with vmv.x.s before subtracting the latency of vmv.x.s from the number.
576564
foreach mx = SchedMxList in {
577565
defvar LMulLat = SiFiveP600GetLMulCycles<mx>.c;
578566
defvar IsWorstCase = SiFiveP600IsWorstCaseMX<mx, SchedMxList>.c;
579-
let Latency = 8, ReleaseAtCycles = [LMulLat] in {
580-
defm "" : LMULWriteResMX<"WriteVSTE", [SiFiveP600VST], mx, IsWorstCase>;
581-
defm "" : LMULWriteResMX<"WriteVSTM", [SiFiveP600VST], mx, IsWorstCase>;
582-
}
583-
let Latency = 12, ReleaseAtCycles = [LMulLat] in {
584-
defm "" : LMULWriteResMX<"WriteVSTS8", [SiFiveP600VST], mx, IsWorstCase>;
585-
defm "" : LMULWriteResMX<"WriteVSTS16", [SiFiveP600VST], mx, IsWorstCase>;
586-
defm "" : LMULWriteResMX<"WriteVSTS32", [SiFiveP600VST], mx, IsWorstCase>;
587-
defm "" : LMULWriteResMX<"WriteVSTS64", [SiFiveP600VST], mx, IsWorstCase>;
567+
let Latency = 8 in {
568+
let ReleaseAtCycles = [LMulLat] in {
569+
defm "" : LMULWriteResMX<"WriteVLDE", [SiFiveP600VLD], mx, IsWorstCase>;
570+
defm "" : LMULWriteResMX<"WriteVLDFF", [SiFiveP600VLD], mx, IsWorstCase>;
571+
572+
defm "" : LMULWriteResMX<"WriteVSTE", [SiFiveP600VST], mx, IsWorstCase>;
573+
}
574+
575+
// Mask load and store have a maximum EMUL of 1.
576+
let ReleaseAtCycles = [SiFiveP600GetLMulCycles<"M1">.c] in {
577+
defm "" : LMULWriteResMX<"WriteVLDM", [SiFiveP600VLD], mx, IsWorstCase=!eq(mx,"M1")>;
578+
defm "" : LMULWriteResMX<"WriteVSTM", [SiFiveP600VST], mx, IsWorstCase=!eq(mx,"M1")>;
579+
}
588580
}
589-
let Latency = 12, ReleaseAtCycles = [LMulLat] in {
590-
defm "" : LMULWriteResMX<"WriteVSTUX8", [SiFiveP600VST], mx, IsWorstCase>;
591-
defm "" : LMULWriteResMX<"WriteVSTUX16", [SiFiveP600VST], mx, IsWorstCase>;
592-
defm "" : LMULWriteResMX<"WriteVSTUX32", [SiFiveP600VST], mx, IsWorstCase>;
593-
defm "" : LMULWriteResMX<"WriteVSTUX64", [SiFiveP600VST], mx, IsWorstCase>;
594-
defm "" : LMULWriteResMX<"WriteVSTOX8", [SiFiveP600VST], mx, IsWorstCase>;
595-
defm "" : LMULWriteResMX<"WriteVSTOX16", [SiFiveP600VST], mx, IsWorstCase>;
596-
defm "" : LMULWriteResMX<"WriteVSTOX32", [SiFiveP600VST], mx, IsWorstCase>;
597-
defm "" : LMULWriteResMX<"WriteVSTOX64", [SiFiveP600VST], mx, IsWorstCase>;
581+
foreach eew = [8, 16, 32, 64] in {
582+
let Latency = SiFiveP600StridedLdStLatency<mx, eew>.val,
583+
ReleaseAtCycles = [SiFiveP600GetVLMAX<mx, eew>.val] in {
584+
defm "" : LMULWriteResMX<"WriteVLDS" # eew, [SiFiveP600VLD], mx, IsWorstCase>;
585+
defm "" : LMULWriteResMX<"WriteVLDUX" # eew, [SiFiveP600VLD], mx, IsWorstCase>;
586+
defm "" : LMULWriteResMX<"WriteVLDOX" # eew, [SiFiveP600VLD], mx, IsWorstCase>;
587+
588+
defm "" : LMULWriteResMX<"WriteVSTS" # eew, [SiFiveP600VST], mx, IsWorstCase>;
589+
defm "" : LMULWriteResMX<"WriteVSTUX" # eew, [SiFiveP600VST], mx, IsWorstCase>;
590+
defm "" : LMULWriteResMX<"WriteVSTOX" # eew, [SiFiveP600VST], mx, IsWorstCase>;
591+
}
598592
}
599593
}
600594

601595
foreach mx = SchedMxList in {
602596
foreach nf=2-8 in {
603597
foreach eew = [8, 16, 32, 64] in {
604-
defvar LMulLat = SiFiveP600GetCyclesSegmented<mx, eew, nf>.c;
598+
defvar LMulLat = SiFiveP600SegmentedLdStCycles<mx, eew, nf>.c;
605599
defvar IsWorstCase = SiFiveP600IsWorstCaseMX<mx, SchedMxList>.c;
606600
let Latency = !add(12, LMulLat), ReleaseAtCycles = [!add(12, LMulLat)] in {
607601
defm "" : LMULWriteResMX<"WriteVLSEG" # nf # "e" # eew, [SiFiveP600VLD], mx, IsWorstCase>;

0 commit comments

Comments
 (0)