|
1 | 1 | ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
|
2 |
| -; RUN: llc -mtriple aarch64 -mattr=+neon,+dotprod < %s | FileCheck %s --check-prefixes=CHECK,CHECK-DOT |
3 |
| -; RUN: llc -mtriple aarch64 -mattr=+neon < %s | FileCheck %s --check-prefixes=CHECK,CHECK-NODOT |
| 2 | +; RUN: llc -mtriple aarch64 -mattr=+neon,+dotprod < %s | FileCheck %s --check-prefixes=CHECK,CHECK-DOT,CHECK-NOI8MM |
| 3 | +; RUN: llc -mtriple aarch64 -mattr=+neon < %s | FileCheck %s --check-prefixes=CHECK,CHECK-NOI8MM,CHECK-NODOT |
| 4 | +; RUN: llc -mtriple aarch64 -mattr=+neon,+dotprod,+i8mm < %s | FileCheck %s --check-prefixes=CHECK,CHECK-DOT,CHECK-I8MM |
4 | 5 |
|
5 | 6 | define <4 x i32> @udot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) {
|
6 | 7 | ; CHECK-DOT-LABEL: udot:
|
@@ -102,7 +103,115 @@ define <2 x i32> @sdot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) {
|
102 | 103 | ret <2 x i32> %partial.reduce
|
103 | 104 | }
|
104 | 105 |
|
105 |
| -define <4 x i32> @not_udot(<4 x i32> %acc, <8 x i8> %u, <8 x i8> %s) { |
; Mixed-sign dot product on 16 lanes: %u is zero-extended, %s is sign-extended,
; the lanes are multiplied, and the 16 products are partially reduced into the
; <4 x i32> accumulator. The run with +i8mm expects a single usdot; the runs
; without it expect the widening-multiply expansion.
;
; The multiply carries nsw only. A negative sign-extended lane multiplied by an
; unsigned lane >= 2 wraps when viewed as an unsigned multiply, so nuw would
; make the result poison for ordinary mixed-sign inputs.
define <4 x i32> @usdot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) {
; CHECK-NOI8MM-LABEL: usdot:
; CHECK-NOI8MM:       // %bb.0:
; CHECK-NOI8MM-NEXT:    ushll v3.8h, v1.8b, #0
; CHECK-NOI8MM-NEXT:    ushll2 v1.8h, v1.16b, #0
; CHECK-NOI8MM-NEXT:    sshll v4.8h, v2.8b, #0
; CHECK-NOI8MM-NEXT:    sshll2 v2.8h, v2.16b, #0
; CHECK-NOI8MM-NEXT:    smlal v0.4s, v4.4h, v3.4h
; CHECK-NOI8MM-NEXT:    smull v5.4s, v2.4h, v1.4h
; CHECK-NOI8MM-NEXT:    smlal2 v0.4s, v2.8h, v1.8h
; CHECK-NOI8MM-NEXT:    smlal2 v5.4s, v4.8h, v3.8h
; CHECK-NOI8MM-NEXT:    add v0.4s, v5.4s, v0.4s
; CHECK-NOI8MM-NEXT:    ret
;
; CHECK-I8MM-LABEL: usdot:
; CHECK-I8MM:       // %bb.0:
; CHECK-I8MM-NEXT:    usdot v0.4s, v1.16b, v2.16b
; CHECK-I8MM-NEXT:    ret
  %u.wide = zext <16 x i8> %u to <16 x i32>
  %s.wide = sext <16 x i8> %s to <16 x i32>
  %mult = mul nsw <16 x i32> %s.wide, %u.wide
  %partial.reduce = tail call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %acc, <16 x i32> %mult)
  ret <4 x i32> %partial.reduce
}
| 130 | + |
; 64-bit variant of the mixed-sign dot product: %u is zero-extended, %s is
; sign-extended, and the 8 products are partially reduced into the <2 x i32>
; accumulator. The run with +i8mm expects a single usdot on the .2s/.8b
; arrangement; the runs without it expect the widening-multiply expansion.
;
; The intrinsic suffix names the actual overload types (<2 x i32> result,
; <8 x i32> input). The multiply carries nsw only, because a negative
; sign-extended lane times an unsigned lane >= 2 wraps as an unsigned multiply.
; NOTE(review): attribute group #0 is not defined in the visible part of this
; file - confirm it exists.
define <2 x i32> @usdot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) #0 {
; CHECK-NOI8MM-LABEL: usdot_narrow:
; CHECK-NOI8MM:       // %bb.0:
; CHECK-NOI8MM-NEXT:    ushll v1.8h, v1.8b, #0
; CHECK-NOI8MM-NEXT:    sshll v2.8h, v2.8b, #0
; CHECK-NOI8MM-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-NOI8MM-NEXT:    smull v3.4s, v2.4h, v1.4h
; CHECK-NOI8MM-NEXT:    smull2 v4.4s, v2.8h, v1.8h
; CHECK-NOI8MM-NEXT:    ext v5.16b, v1.16b, v1.16b, #8
; CHECK-NOI8MM-NEXT:    ext v6.16b, v2.16b, v2.16b, #8
; CHECK-NOI8MM-NEXT:    smlal v0.4s, v2.4h, v1.4h
; CHECK-NOI8MM-NEXT:    ext v3.16b, v3.16b, v3.16b, #8
; CHECK-NOI8MM-NEXT:    ext v1.16b, v4.16b, v4.16b, #8
; CHECK-NOI8MM-NEXT:    smlal v3.4s, v6.4h, v5.4h
; CHECK-NOI8MM-NEXT:    add v0.2s, v1.2s, v0.2s
; CHECK-NOI8MM-NEXT:    add v0.2s, v3.2s, v0.2s
; CHECK-NOI8MM-NEXT:    ret
;
; CHECK-I8MM-LABEL: usdot_narrow:
; CHECK-I8MM:       // %bb.0:
; CHECK-I8MM-NEXT:    usdot v0.2s, v1.8b, v2.8b
; CHECK-I8MM-NEXT:    ret
  %u.wide = zext <8 x i8> %u to <8 x i32>
  %s.wide = sext <8 x i8> %s to <8 x i32>
  %mult = mul nsw <8 x i32> %s.wide, %u.wide
  %partial.reduce = tail call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> %acc, <8 x i32> %mult)
  ret <2 x i32> %partial.reduce
}
| 159 | + |
; Mixed-sign dot product with the signedness of the operands swapped relative
; to @usdot: here %u is sign-extended and %s is zero-extended. The run with
; +i8mm still expects a usdot, with the two source registers exchanged (v2
; before v1); the runs without it expect the widening-multiply expansion.
;
; The multiply carries nsw only, because a negative sign-extended lane times an
; unsigned lane >= 2 wraps as an unsigned multiply and nuw would yield poison.
; NOTE(review): attribute group #0 is not defined in the visible part of this
; file - confirm it exists.
define <4 x i32> @sudot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) #0 {
; CHECK-NOI8MM-LABEL: sudot:
; CHECK-NOI8MM:       // %bb.0:
; CHECK-NOI8MM-NEXT:    sshll v3.8h, v1.8b, #0
; CHECK-NOI8MM-NEXT:    sshll2 v1.8h, v1.16b, #0
; CHECK-NOI8MM-NEXT:    ushll v4.8h, v2.8b, #0
; CHECK-NOI8MM-NEXT:    ushll2 v2.8h, v2.16b, #0
; CHECK-NOI8MM-NEXT:    smlal v0.4s, v4.4h, v3.4h
; CHECK-NOI8MM-NEXT:    smull v5.4s, v2.4h, v1.4h
; CHECK-NOI8MM-NEXT:    smlal2 v0.4s, v2.8h, v1.8h
; CHECK-NOI8MM-NEXT:    smlal2 v5.4s, v4.8h, v3.8h
; CHECK-NOI8MM-NEXT:    add v0.4s, v5.4s, v0.4s
; CHECK-NOI8MM-NEXT:    ret
;
; CHECK-I8MM-LABEL: sudot:
; CHECK-I8MM:       // %bb.0:
; CHECK-I8MM-NEXT:    usdot v0.4s, v2.16b, v1.16b
; CHECK-I8MM-NEXT:    ret
  %u.wide = sext <16 x i8> %u to <16 x i32>
  %s.wide = zext <16 x i8> %s to <16 x i32>
  %mult = mul nsw <16 x i32> %s.wide, %u.wide
  %partial.reduce = tail call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %acc, <16 x i32> %mult)
  ret <4 x i32> %partial.reduce
}
| 184 | + |
; 64-bit variant of @sudot: %u is sign-extended, %s is zero-extended, and the
; 8 products are partially reduced into the <2 x i32> accumulator. The run with
; +i8mm expects a usdot on the .2s/.8b arrangement with the source registers
; exchanged (v2 before v1); the runs without it expect the widening-multiply
; expansion.
;
; The intrinsic suffix names the actual overload types (<2 x i32> result,
; <8 x i32> input). The multiply carries nsw only, because a negative
; sign-extended lane times an unsigned lane >= 2 wraps as an unsigned multiply.
; NOTE(review): attribute group #0 is not defined in the visible part of this
; file - confirm it exists.
define <2 x i32> @sudot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) #0 {
; CHECK-NOI8MM-LABEL: sudot_narrow:
; CHECK-NOI8MM:       // %bb.0:
; CHECK-NOI8MM-NEXT:    sshll v1.8h, v1.8b, #0
; CHECK-NOI8MM-NEXT:    ushll v2.8h, v2.8b, #0
; CHECK-NOI8MM-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-NOI8MM-NEXT:    smull v3.4s, v2.4h, v1.4h
; CHECK-NOI8MM-NEXT:    smull2 v4.4s, v2.8h, v1.8h
; CHECK-NOI8MM-NEXT:    ext v5.16b, v1.16b, v1.16b, #8
; CHECK-NOI8MM-NEXT:    ext v6.16b, v2.16b, v2.16b, #8
; CHECK-NOI8MM-NEXT:    smlal v0.4s, v2.4h, v1.4h
; CHECK-NOI8MM-NEXT:    ext v3.16b, v3.16b, v3.16b, #8
; CHECK-NOI8MM-NEXT:    ext v1.16b, v4.16b, v4.16b, #8
; CHECK-NOI8MM-NEXT:    smlal v3.4s, v6.4h, v5.4h
; CHECK-NOI8MM-NEXT:    add v0.2s, v1.2s, v0.2s
; CHECK-NOI8MM-NEXT:    add v0.2s, v3.2s, v0.2s
; CHECK-NOI8MM-NEXT:    ret
;
; CHECK-I8MM-LABEL: sudot_narrow:
; CHECK-I8MM:       // %bb.0:
; CHECK-I8MM-NEXT:    usdot v0.2s, v2.8b, v1.8b
; CHECK-I8MM-NEXT:    ret
  %u.wide = sext <8 x i8> %u to <8 x i32>
  %s.wide = zext <8 x i8> %s to <8 x i32>
  %mult = mul nsw <8 x i32> %s.wide, %u.wide
  %partial.reduce = tail call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> %acc, <8 x i32> %mult)
  ret <2 x i32> %partial.reduce
}
| 213 | + |
| 214 | +define <4 x i32> @not_udot(<4 x i32> %acc, <8 x i8> %u, <8 x i8> %s) #0{ |
106 | 215 | ; CHECK-LABEL: not_udot:
|
107 | 216 | ; CHECK: // %bb.0:
|
108 | 217 | ; CHECK-NEXT: umull v1.8h, v2.8b, v1.8b
|
|
0 commit comments