@@ -22,6 +22,8 @@ class SiFiveP400IsWorstCaseMXSEW<string mx, int sew, list<string> MxList, bit is
   bit c = !and(!eq(mx, LLMUL), !eq(sew, SSEW));
 }

+defvar SiFiveP400VLEN = 128;
+
 // 1 Micro-Op per cycle.
 class SiFiveP400GetLMulCycles<string mx> {
   int c = !cond(
@@ -35,19 +37,31 @@ class SiFiveP400GetLMulCycles<string mx> {
   );
 }

-// Latency for segmented loads and stores are calculated as vl * nf.
-class SiFiveP400GetCyclesSegmented<string mx, int sew, int nf> {
-  defvar VLEN = 128;
-  defvar VLUpperBound = !cond(
-    !eq(mx, "M1") : !div(VLEN, sew),
-    !eq(mx, "M2") : !div(!mul(VLEN, 2), sew),
-    !eq(mx, "M4") : !div(!mul(VLEN, 4), sew),
-    !eq(mx, "M8") : !div(!mul(VLEN, 8), sew),
-    !eq(mx, "MF2") : !div(!div(VLEN, 2), sew),
-    !eq(mx, "MF4") : !div(!div(VLEN, 4), sew),
-    !eq(mx, "MF8") : !div(!div(VLEN, 8), sew),
+class SiFiveP400GetVLMAX<string mx, int sew> {
+  defvar LMUL = SiFiveP400GetLMulCycles<mx>.c;
+  int val = !cond(
+    !eq(mx, "MF2") : !div(!div(SiFiveP400VLEN, 2), sew),
+    !eq(mx, "MF4") : !div(!div(SiFiveP400VLEN, 4), sew),
+    !eq(mx, "MF8") : !div(!div(SiFiveP400VLEN, 8), sew),
+    true: !div(!mul(SiFiveP400VLEN, LMUL), sew)
+  );
+}
+
+class SiFiveP400StridedLdStLatency<string mx, int sew> {
+  defvar VL = SiFiveP400GetVLMAX<mx, sew>.val;
+  int val = !cond(
+    !eq(VL, 2): 13,
+    !eq(VL, 4): 18,
+    !eq(VL, 8): 22,
+    !eq(VL, 16): 30,
+    // VL=32,64,128
+    true: !sub(VL, 2)
   );
-  int c = !mul(VLUpperBound, nf);
+}
+
+// Latency for segmented loads and stores are calculated as vl * nf.
+class SiFiveP400SegmentedLdStCycles<string mx, int sew, int nf> {
+  int c = !mul(SiFiveP400GetVLMAX<mx, sew>.val, nf);
 }

 // Both variants of floating point vector reductions are based on numbers collected
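For reference, a minimal standalone sketch (not part of the patch) of how the new VLMAX helper evaluates; the Example* names are hypothetical, and the integer LMUL mapping is assumed to mirror SiFiveP400GetLMulCycles. With VLEN = 128, LMUL = 2 and SEW = 32 it yields VLMAX = 128 * 2 / 32 = 8, which the strided latency table above then maps to 22 cycles.

defvar ExampleVLEN = 128;

class ExampleGetVLMAX<string mx, int sew> {
  // Integer LMUL: 2/4/8 for M2/M4/M8, otherwise 1 (assumed to match
  // SiFiveP400GetLMulCycles in the patch).
  defvar LMUL = !cond(!eq(mx, "M2") : 2, !eq(mx, "M4") : 4, !eq(mx, "M8") : 8, true : 1);
  int val = !cond(
    // Fractional LMUL shrinks VLEN before dividing by SEW.
    !eq(mx, "MF2") : !div(!div(ExampleVLEN, 2), sew),
    !eq(mx, "MF4") : !div(!div(ExampleVLEN, 4), sew),
    !eq(mx, "MF8") : !div(!div(ExampleVLEN, 8), sew),
    // Integral LMUL: VLMAX = VLEN * LMUL / SEW.
    true : !div(!mul(ExampleVLEN, LMUL), sew)
  );
}

// Evaluates to val = 8 when processed with llvm-tblgen.
def ExampleVLMAX_M2_E32 : ExampleGetVLMAX<"M2", 32>;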
@@ -368,65 +382,45 @@ def : WriteRes<WriteVSETIVLI, [SiFiveP400SYS]>;
 def : WriteRes<WriteVSETVL, [SiFiveP400SYS]>;

 // 7. Vector Loads and Stores
-// FIXME: This unit is still being improved, currently
-// it is based on stage numbers. Estimates are optimistic,
-// latency may be longer.
-foreach mx = SchedMxList in {
-  defvar LMulLat = SiFiveP400GetLMulCycles<mx>.c;
-  defvar IsWorstCase = SiFiveP400IsWorstCaseMX<mx, SchedMxList>.c;
-  let Latency = 8, ReleaseAtCycles = [LMulLat] in {
-    defm "" : LMULWriteResMX<"WriteVLDE", [SiFiveP400VLD], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVLDM", [SiFiveP400VLD], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVLDFF", [SiFiveP400VLD], mx, IsWorstCase>;
-  }
-  let Latency = 12, ReleaseAtCycles = [LMulLat] in {
-    defm "" : LMULWriteResMX<"WriteVLDS8", [SiFiveP400VLD], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVLDS16", [SiFiveP400VLD], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVLDS32", [SiFiveP400VLD], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVLDS64", [SiFiveP400VLD], mx, IsWorstCase>;
-  }
-  let Latency = 12, ReleaseAtCycles = [LMulLat] in {
-    defm "" : LMULWriteResMX<"WriteVLDUX8", [SiFiveP400VLD], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVLDUX16", [SiFiveP400VLD], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVLDUX32", [SiFiveP400VLD], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVLDUX64", [SiFiveP400VLD], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVLDOX8", [SiFiveP400VLD], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVLDOX16", [SiFiveP400VLD], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVLDOX32", [SiFiveP400VLD], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVLDOX64", [SiFiveP400VLD], mx, IsWorstCase>;
-  }
-}

+// Note that the latency of vector loads are measured by consuming the loaded
+// value with vmv.x.s before subtracting the latency of vmv.x.s from the number.
 foreach mx = SchedMxList in {
   defvar LMulLat = SiFiveP400GetLMulCycles<mx>.c;
   defvar IsWorstCase = SiFiveP400IsWorstCaseMX<mx, SchedMxList>.c;
-  let Latency = 8, ReleaseAtCycles = [LMulLat] in {
-    defm "" : LMULWriteResMX<"WriteVSTE", [SiFiveP400VST], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVSTM", [SiFiveP400VST], mx, IsWorstCase>;
-  }
-  let Latency = 12, ReleaseAtCycles = [LMulLat] in {
-    defm "" : LMULWriteResMX<"WriteVSTS8", [SiFiveP400VST], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVSTS16", [SiFiveP400VST], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVSTS32", [SiFiveP400VST], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVSTS64", [SiFiveP400VST], mx, IsWorstCase>;
+  let Latency = 8 in {
+    let ReleaseAtCycles = [LMulLat] in {
+      defm "" : LMULWriteResMX<"WriteVLDE", [SiFiveP400VLD], mx, IsWorstCase>;
+      defm "" : LMULWriteResMX<"WriteVLDFF", [SiFiveP400VLD], mx, IsWorstCase>;
+
+      defm "" : LMULWriteResMX<"WriteVSTE", [SiFiveP400VST], mx, IsWorstCase>;
+    }
+
+    // Mask load and store have a maximum EMUL of 1.
+    let ReleaseAtCycles = [SiFiveP400GetLMulCycles<"M1">.c] in {
+      defm "" : LMULWriteResMX<"WriteVLDM", [SiFiveP400VLD], mx, IsWorstCase=!eq(mx, "M1")>;
+      defm "" : LMULWriteResMX<"WriteVSTM", [SiFiveP400VST], mx, IsWorstCase=!eq(mx, "M1")>;
+    }
   }
-  let Latency = 12, ReleaseAtCycles = [LMulLat] in {
-    defm "" : LMULWriteResMX<"WriteVSTUX8", [SiFiveP400VST], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVSTUX16", [SiFiveP400VST], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVSTUX32", [SiFiveP400VST], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVSTUX64", [SiFiveP400VST], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVSTOX8", [SiFiveP400VST], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVSTOX16", [SiFiveP400VST], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVSTOX32", [SiFiveP400VST], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVSTOX64", [SiFiveP400VST], mx, IsWorstCase>;
+  foreach eew = [8, 16, 32, 64] in {
+    let Latency = SiFiveP400StridedLdStLatency<mx, eew>.val,
+        ReleaseAtCycles = [SiFiveP400GetVLMAX<mx, eew>.val] in {
+      defm "" : LMULWriteResMX<"WriteVLDS" # eew, [SiFiveP400VLD], mx, IsWorstCase>;
+      defm "" : LMULWriteResMX<"WriteVLDUX" # eew, [SiFiveP400VLD], mx, IsWorstCase>;
+      defm "" : LMULWriteResMX<"WriteVLDOX" # eew, [SiFiveP400VLD], mx, IsWorstCase>;
+
+      defm "" : LMULWriteResMX<"WriteVSTS" # eew, [SiFiveP400VST], mx, IsWorstCase>;
+      defm "" : LMULWriteResMX<"WriteVSTUX" # eew, [SiFiveP400VST], mx, IsWorstCase>;
+      defm "" : LMULWriteResMX<"WriteVSTOX" # eew, [SiFiveP400VST], mx, IsWorstCase>;
+    }
   }
 }
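As a rough worked example (a sketch, not part of the patch; the Example* names are hypothetical), the strided/indexed writes above now take their latency from the per-VL table and occupy the load/store pipe for VLMAX cycles instead of scaling with LMUL. At VLEN = 128, an indexed load with LMUL = 8 and SEW = 8 has VLMAX = 128, so it is modeled with latency 128 - 2 = 126 and ReleaseAtCycles = 128; at LMUL = 1 and SEW = 16, VLMAX = 8 and the table gives 22 cycles.

class ExampleStridedLdStLatency<int vl> {
  int val = !cond(
    !eq(vl, 2)  : 13,
    !eq(vl, 4)  : 18,
    !eq(vl, 8)  : 22,
    !eq(vl, 16) : 30,
    // VL = 32, 64, 128 fall through to VL - 2.
    true : !sub(vl, 2)
  );
}

// val = 126 for VLMAX = 128; val = 22 for VLMAX = 8.
def ExampleStridedLat_VL128 : ExampleStridedLdStLatency<128>;
def ExampleStridedLat_VL8   : ExampleStridedLdStLatency<8>;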

 foreach mx = SchedMxList in {
   foreach nf=2-8 in {
     foreach eew = [8, 16, 32, 64] in {
       defvar IsWorstCase = SiFiveP400IsWorstCaseMX<mx, SchedMxList>.c;
-      defvar LMulLat = SiFiveP400GetCyclesSegmented <mx, eew, nf>.c;
+      defvar LMulLat = SiFiveP400SegmentedLdStCycles <mx, eew, nf>.c;
       let Latency = !add(12, LMulLat), ReleaseAtCycles = [!add(12, LMulLat)] in {
         defm "" : LMULWriteResMX<"WriteVLSEG" # nf # "e" #eew, [SiFiveP400VLD], mx, IsWorstCase>;
         defm "" : LMULWriteResMX<"WriteVLSEGFF" # nf # "e" #eew, [SiFiveP400VLD], mx, IsWorstCase>;