@@ -26,60 +26,53 @@ define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 1
26
26
; CHECK-LABEL: fcvt_x4_tuple:
27
27
; CHECK: // %bb.0: // %entry
28
28
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
29
- ; CHECK-NEXT: addvl sp, sp, #-10
29
+ ; CHECK-NEXT: addvl sp, sp, #-9
30
30
; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
31
- ; CHECK-NEXT: ptrue pn8.b
32
- ; CHECK-NEXT: str z19, [sp, #1, mul vl] // 16-byte Folded Spill
33
- ; CHECK-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
34
- ; CHECK-NEXT: str z18, [sp, #2, mul vl] // 16-byte Folded Spill
35
- ; CHECK-NEXT: str z17, [sp, #3, mul vl] // 16-byte Folded Spill
36
- ; CHECK-NEXT: str z16, [sp, #4, mul vl] // 16-byte Folded Spill
37
- ; CHECK-NEXT: str z14, [sp, #5, mul vl] // 16-byte Folded Spill
38
- ; CHECK-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #8, mul vl] // 32-byte Folded Spill
39
- ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xd0, 0x00, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 80 * VG
31
+ ; CHECK-NEXT: str z15, [sp, #1, mul vl] // 16-byte Folded Spill
32
+ ; CHECK-NEXT: str z14, [sp, #2, mul vl] // 16-byte Folded Spill
33
+ ; CHECK-NEXT: str z13, [sp, #3, mul vl] // 16-byte Folded Spill
34
+ ; CHECK-NEXT: str z12, [sp, #4, mul vl] // 16-byte Folded Spill
35
+ ; CHECK-NEXT: str z11, [sp, #5, mul vl] // 16-byte Folded Spill
36
+ ; CHECK-NEXT: str z10, [sp, #6, mul vl] // 16-byte Folded Spill
37
+ ; CHECK-NEXT: str z9, [sp, #7, mul vl] // 16-byte Folded Spill
38
+ ; CHECK-NEXT: str z8, [sp, #8, mul vl] // 16-byte Folded Spill
39
+ ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xc8, 0x00, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 72 * VG
40
40
; CHECK-NEXT: .cfi_offset w29, -16
41
41
; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
42
42
; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
43
43
; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG
44
44
; CHECK-NEXT: .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 16 - 32 * VG
45
- ; CHECK-NEXT: .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 16 - 40 * VG
46
- ; CHECK-NEXT: ptrue pn8.b
45
+ ; CHECK-NEXT: .cfi_escape 0x10, 0x4c, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 16 - 40 * VG
46
+ ; CHECK-NEXT: .cfi_escape 0x10, 0x4d, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 16 - 48 * VG
47
+ ; CHECK-NEXT: .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 16 - 56 * VG
48
+ ; CHECK-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 64 * VG
47
49
; CHECK-NEXT: lsl x9, x0, #1
50
+ ; CHECK-NEXT: ptrue pn8.b
48
51
; CHECK-NEXT: add x8, x1, x0
49
- ; CHECK-NEXT: ld1w { z0.s - z3.s }, pn8/z, [x1]
50
- ; CHECK-NEXT: ld1w { z4.s - z7.s }, pn8/z, [x8]
52
+ ; CHECK-NEXT: ld1w { z0.s, z4.s, z8.s, z12.s }, pn8/z, [x1]
53
+ ; CHECK-NEXT: ld1w { z1.s, z5.s, z9.s, z13.s }, pn8/z, [x8]
51
54
; CHECK-NEXT: add x10, x1, x9
52
55
; CHECK-NEXT: add x8, x8, x9
53
- ; CHECK-NEXT: mov z8.d, z2.d
54
56
; CHECK-NEXT: ld1w { z2.s, z6.s, z10.s, z14.s }, pn8/z, [x10]
55
- ; CHECK-NEXT: ld1w { z16.s - z19.s }, pn8/z, [x8]
56
- ; CHECK-NEXT: mov z24.d, z0.d
57
- ; CHECK-NEXT: mov z28.d, z1.d
58
- ; CHECK-NEXT: mov z25.d, z4.d
59
- ; CHECK-NEXT: mov z29.d, z5.d
60
- ; CHECK-NEXT: mov z9.d, z5.d
61
- ; CHECK-NEXT: ptrue pn8.b
62
- ; CHECK-NEXT: mov z26.d, z2.d
63
- ; CHECK-NEXT: mov z30.d, z6.d
64
- ; CHECK-NEXT: mov z27.d, z16.d
65
- ; CHECK-NEXT: mov z31.d, z17.d
66
- ; CHECK-NEXT: mov z11.d, z18.d
67
- ; CHECK-NEXT: mov z16.d, z3.d
68
- ; CHECK-NEXT: mov z17.d, z7.d
69
- ; CHECK-NEXT: mov z18.d, z14.d
70
- ; CHECK-NEXT: fcvt z0.b, { z24.s - z27.s }
71
- ; CHECK-NEXT: fcvt z1.b, { z28.s - z31.s }
72
- ; CHECK-NEXT: fcvt z2.b, { z8.s - z11.s }
73
- ; CHECK-NEXT: fcvt z3.b, { z16.s - z19.s }
74
- ; CHECK-NEXT: ldr z19, [sp, #1, mul vl] // 16-byte Folded Reload
75
- ; CHECK-NEXT: ldr z18, [sp, #2, mul vl] // 16-byte Folded Reload
76
- ; CHECK-NEXT: ldr z17, [sp, #3, mul vl] // 16-byte Folded Reload
77
- ; CHECK-NEXT: ldr z16, [sp, #4, mul vl] // 16-byte Folded Reload
78
- ; CHECK-NEXT: ldr z14, [sp, #5, mul vl] // 16-byte Folded Reload
79
- ; CHECK-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload
80
- ; CHECK-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #8, mul vl] // 32-byte Folded Reload
57
+ ; CHECK-NEXT: ld1w { z3.s, z7.s, z11.s, z15.s }, pn8/z, [x8]
58
+ ; CHECK-NEXT: mov z24.d, z8.d
59
+ ; CHECK-NEXT: mov z25.d, z5.d
60
+ ; CHECK-NEXT: mov z26.d, z10.d
61
+ ; CHECK-NEXT: mov z27.d, z11.d
62
+ ; CHECK-NEXT: fcvt z0.b, { z0.s - z3.s }
63
+ ; CHECK-NEXT: fcvt z1.b, { z4.s - z7.s }
64
+ ; CHECK-NEXT: fcvt z2.b, { z24.s - z27.s }
65
+ ; CHECK-NEXT: fcvt z3.b, { z12.s - z15.s }
66
+ ; CHECK-NEXT: ldr z15, [sp, #1, mul vl] // 16-byte Folded Reload
67
+ ; CHECK-NEXT: ldr z14, [sp, #2, mul vl] // 16-byte Folded Reload
68
+ ; CHECK-NEXT: ldr z13, [sp, #3, mul vl] // 16-byte Folded Reload
69
+ ; CHECK-NEXT: ldr z12, [sp, #4, mul vl] // 16-byte Folded Reload
70
+ ; CHECK-NEXT: ldr z11, [sp, #5, mul vl] // 16-byte Folded Reload
71
+ ; CHECK-NEXT: ldr z10, [sp, #6, mul vl] // 16-byte Folded Reload
72
+ ; CHECK-NEXT: ldr z9, [sp, #7, mul vl] // 16-byte Folded Reload
73
+ ; CHECK-NEXT: ldr z8, [sp, #8, mul vl] // 16-byte Folded Reload
81
74
; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
82
- ; CHECK-NEXT: addvl sp, sp, #10
75
+ ; CHECK-NEXT: addvl sp, sp, #9
83
76
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
84
77
; CHECK-NEXT: ret
85
78
entry:
@@ -144,21 +137,24 @@ define { <vscale x 16 x i8>, <vscale x 16 x i8> } @bfcvt_tuple(i64 %stride, ptr
144
137
; CHECK-LABEL: bfcvt_tuple:
145
138
; CHECK: // %bb.0: // %entry
146
139
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
147
- ; CHECK-NEXT: addvl sp, sp, #-1
140
+ ; CHECK-NEXT: addvl sp, sp, #-3
148
141
; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
149
- ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
142
+ ; CHECK-NEXT: str z9, [sp, #1, mul vl] // 16-byte Folded Spill
143
+ ; CHECK-NEXT: str z8, [sp, #2, mul vl] // 16-byte Folded Spill
144
+ ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG
150
145
; CHECK-NEXT: .cfi_offset w29, -16
146
+ ; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
147
+ ; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
151
148
; CHECK-NEXT: ptrue pn8.b
152
149
; CHECK-NEXT: add x8, x1, x0
153
- ; CHECK-NEXT: ld1h { z0.h, z1.h }, pn8/z, [x1]
154
- ; CHECK-NEXT: ld1h { z2.h, z3.h }, pn8/z, [x8]
155
- ; CHECK-NEXT: mov z4.d, z0.d
156
- ; CHECK-NEXT: mov z5.d, z2.d
157
- ; CHECK-NEXT: mov z2.d, z1.d
158
- ; CHECK-NEXT: bfcvt z0.b, { z4.h, z5.h }
159
- ; CHECK-NEXT: bfcvt z1.b, { z2.h, z3.h }
150
+ ; CHECK-NEXT: ld1h { z0.h, z8.h }, pn8/z, [x1]
151
+ ; CHECK-NEXT: ld1h { z1.h, z9.h }, pn8/z, [x8]
152
+ ; CHECK-NEXT: bfcvt z0.b, { z0.h, z1.h }
153
+ ; CHECK-NEXT: bfcvt z1.b, { z8.h, z9.h }
154
+ ; CHECK-NEXT: ldr z9, [sp, #1, mul vl] // 16-byte Folded Reload
155
+ ; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload
160
156
; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
161
- ; CHECK-NEXT: addvl sp, sp, #1
157
+ ; CHECK-NEXT: addvl sp, sp, #3
162
158
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
163
159
; CHECK-NEXT: ret
164
160
entry:
0 commit comments