3
3
4
4
; 3072-byte, zero-initialized global with 16-byte alignment. test_api loads a
; tile from byte offset 2048 of this buffer.
@buf = dso_local global [3072 x i8 ] zeroinitializer , align 16
5
5
6
- define dso_local void @test_api (i16 signext %0 , i16 signext %1 ) local_unnamed_addr {
6
+ define dso_local void @test_api (i16 signext %0 , i16 signext %1 ) nounwind {
7
7
; CHECK-LABEL: test_api:
8
8
; CHECK: # %bb.0:
9
9
; CHECK-NEXT: pushq %rbp
10
- ; CHECK-NEXT: .cfi_def_cfa_offset 16
11
10
; CHECK-NEXT: pushq %r15
12
- ; CHECK-NEXT: .cfi_def_cfa_offset 24
13
11
; CHECK-NEXT: pushq %r14
14
- ; CHECK-NEXT: .cfi_def_cfa_offset 32
15
12
; CHECK-NEXT: pushq %rbx
16
- ; CHECK-NEXT: .cfi_def_cfa_offset 40
17
13
; CHECK-NEXT: subq $4056, %rsp # imm = 0xFD8
18
- ; CHECK-NEXT: .cfi_def_cfa_offset 4096
19
- ; CHECK-NEXT: .cfi_offset %rbx, -40
20
- ; CHECK-NEXT: .cfi_offset %r14, -32
21
- ; CHECK-NEXT: .cfi_offset %r15, -24
22
- ; CHECK-NEXT: .cfi_offset %rbp, -16
23
14
; CHECK-NEXT: movl %esi, %ebx
24
15
; CHECK-NEXT: movl %edi, %ebp
25
16
; CHECK-NEXT: vpxord %zmm0, %zmm0, %zmm0
@@ -92,15 +83,10 @@ define dso_local void @test_api(i16 signext %0, i16 signext %1) local_unnamed_ad
92
83
; CHECK-NEXT: movl $buf+2048, %eax
93
84
; CHECK-NEXT: tilestored %tmm5, (%rax,%rcx)
94
85
; CHECK-NEXT: addq $4056, %rsp # imm = 0xFD8
95
- ; CHECK-NEXT: .cfi_def_cfa_offset 40
96
86
; CHECK-NEXT: popq %rbx
97
- ; CHECK-NEXT: .cfi_def_cfa_offset 32
98
87
; CHECK-NEXT: popq %r14
99
- ; CHECK-NEXT: .cfi_def_cfa_offset 24
100
88
; CHECK-NEXT: popq %r15
101
- ; CHECK-NEXT: .cfi_def_cfa_offset 16
102
89
; CHECK-NEXT: popq %rbp
103
- ; CHECK-NEXT: .cfi_def_cfa_offset 8
104
90
; CHECK-NEXT: tilerelease
105
91
; CHECK-NEXT: retq
106
92
%c = tail call x86_amx @llvm.x86.tileloadd64.internal (i16 %0 , i16 %1 , i8* getelementptr inbounds ([3072 x i8 ], [3072 x i8 ]* @buf , i64 0 , i64 2048 ), i64 32 )
@@ -126,8 +112,93 @@ exit:
126
112
ret void
127
113
}
128
114
129
- declare dso_local void @foo (...) local_unnamed_addr
115
; test3: a tile (%t5) is created by tilezero in the entry block and stored on
; every loop iteration; each iteration also calls @foo and then builds and uses
; further tiles (%t1..%t4).
; The expected assembly below saves the tile configuration once before the loop
; (sttilecfg, "Folded Spill") and reloads it right after the call (ldtilecfg,
; "Folded Reload"). tmm0 is written to a stack slot at the top of each iteration
; and read back at the bottom (tilestored / tileloadd on 1024(%rsp,%rax)).
+ define dso_local void @test3 (i8 *%buf ) nounwind {
116
+ ; CHECK-LABEL: test3:
117
+ ; CHECK: # %bb.0: # %entry
118
+ ; CHECK-NEXT: pushq %rbp
119
+ ; CHECK-NEXT: pushq %r15
120
+ ; CHECK-NEXT: pushq %r14
121
+ ; CHECK-NEXT: pushq %rbx
122
+ ; CHECK-NEXT: subq $3032, %rsp # imm = 0xBD8
123
+ ; CHECK-NEXT: vpxord %zmm0, %zmm0, %zmm0
124
+ ; CHECK-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
125
+ ; CHECK-NEXT: movb $1, {{[0-9]+}}(%rsp)
126
+ ; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp)
127
+ ; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp)
128
+ ; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp)
129
+ ; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp)
130
+ ; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp)
131
+ ; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp)
132
+ ; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
133
+ ; CHECK-NEXT: movw $8, %r15w
134
+ ; CHECK-NEXT: tilezero %tmm0
135
+ ; CHECK-NEXT: xorl %eax, %eax
136
+ ; CHECK-NEXT: testb %al, %al
137
+ ; CHECK-NEXT: jne .LBB1_3
138
+ ; CHECK-NEXT: # %bb.1: # %loop.header.preheader
139
+ ; CHECK-NEXT: movq %rdi, %rbx
140
+ ; CHECK-NEXT: movl $32, %r14d
141
+ ; CHECK-NEXT: xorl %ebp, %ebp
142
+ ; CHECK-NEXT: sttilecfg {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Folded Spill
143
+ ; CHECK-NEXT: .p2align 4, 0x90
144
+ ; CHECK-NEXT: .LBB1_2: # %loop.header
145
+ ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
146
+ ; CHECK-NEXT: movabsq $64, %rax
147
+ ; CHECK-NEXT: tilestored %tmm0, 1024(%rsp,%rax) # 1024-byte Folded Spill
148
+ ; CHECK-NEXT: tilestored %tmm0, (%rbx,%r14)
149
+ ; CHECK-NEXT: xorl %eax, %eax
150
+ ; CHECK-NEXT: vzeroupper
151
+ ; CHECK-NEXT: callq foo
152
+ ; CHECK-NEXT: ldtilecfg {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Folded Reload
153
+ ; CHECK-NEXT: tilezero %tmm0
154
+ ; CHECK-NEXT: tileloadd (%rbx,%r14), %tmm1
155
+ ; CHECK-NEXT: tileloadd (%rbx,%r14), %tmm2
156
+ ; CHECK-NEXT: tdpbssd %tmm2, %tmm1, %tmm0
157
+ ; CHECK-NEXT: tilestored %tmm0, (%rbx,%r14)
158
+ ; CHECK-NEXT: movabsq $64, %rax
159
+ ; CHECK-NEXT: tileloadd 1024(%rsp,%rax), %tmm0 # 1024-byte Folded Reload
160
+ ; CHECK-NEXT: incl %ebp
161
+ ; CHECK-NEXT: cmpw $100, %bp
162
+ ; CHECK-NEXT: jl .LBB1_2
163
+ ; CHECK-NEXT: .LBB1_3: # %exit
164
+ ; CHECK-NEXT: addq $3032, %rsp # imm = 0xBD8
165
+ ; CHECK-NEXT: popq %rbx
166
+ ; CHECK-NEXT: popq %r14
167
+ ; CHECK-NEXT: popq %r15
168
+ ; CHECK-NEXT: popq %rbp
169
+ ; CHECK-NEXT: tilerelease
170
+ ; CHECK-NEXT: vzeroupper
171
+ ; CHECK-NEXT: retq
172
; %t5 is defined here, outside the loop, and consumed inside loop.header.
; NOTE(review): the branch condition is undef -- presumably only the CFG shape
; (loop vs. direct exit) matters to this test; confirm before changing it.
+ entry:
173
+ %t5 = tail call x86_amx @llvm.x86.tilezero.internal (i16 8 , i16 8 )
174
+ br i1 undef , label %loop.header , label %exit
175
+
176
; %ivphi is the iteration counter: 0 when coming from entry, %iv when coming
; from loop.latch. The block stores %t5 to %buf and then calls @foo.
+ loop.header:
177
+ %ivphi = phi i16 [0 , %entry ], [%iv , %loop.latch ]
178
+ call void @llvm.x86.tilestored64.internal (i16 8 , i16 8 , i8* %buf , i64 32 , x86_amx %t5 )
179
+ call void (...) @foo ()
180
+ br label %loop.body
181
+
182
; After the call: create a zeroed tile (%t1), load two tiles from %buf
; (%t2, %t3), combine the three with tdpbssd (%t4) and store %t4 back to %buf.
+ loop.body:
183
+ %t1 = tail call x86_amx @llvm.x86.tilezero.internal (i16 8 , i16 8 )
184
+ %t2 = tail call x86_amx @llvm.x86.tileloadd64.internal (i16 8 , i16 8 , i8* %buf , i64 32 )
185
+ %t3 = tail call x86_amx @llvm.x86.tileloadd64.internal (i16 8 , i16 8 , i8* %buf , i64 32 )
186
+ %t4 = tail call x86_amx @llvm.x86.tdpbssd.internal (i16 8 , i16 8 , i16 8 , x86_amx %t1 , x86_amx %t2 , x86_amx %t3 )
187
+ tail call void @llvm.x86.tilestored64.internal (i16 8 , i16 8 , i8* %buf , i64 32 , x86_amx %t4 )
188
+ br label %loop.latch
189
+
190
; Increment the counter and branch back to loop.header while %iv < 100
; (signed compare); otherwise fall out to exit.
+ loop.latch:
191
+ %iv = add i16 %ivphi , 1
192
+ %c = icmp slt i16 %iv , 100
193
+ br i1 %c , label %loop.header , label %exit
194
+
195
+ exit:
196
+ ret void
197
+ }
198
+
199
; External declarations used by the test functions: @foo is the variadic callee
; invoked inside test3's loop; the llvm.x86.*.internal declarations are the AMX
; intrinsics (tile zero / load / tdpbssd / store) called from the IR bodies.
+ declare dso_local void @foo (...) nounwind
130
200
201
+ declare x86_amx @llvm.x86.tilezero.internal (i16 , i16 )
131
202
declare x86_amx @llvm.x86.tileloadd64.internal (i16 , i16 , i8* , i64 )
132
203
declare x86_amx @llvm.x86.tdpbssd.internal (i16 , i16 , i16 , x86_amx, x86_amx, x86_amx)
133
204
declare void @llvm.x86.tilestored64.internal (i16 , i16 , i8* , i64 , x86_amx)
0 commit comments