@@ -1103,6 +1103,339 @@ def UMops2WayOp
1103
1103
}];
1104
1104
}
1105
1105
1106
// Convenience base for the 4-way widening outer product ops: forwards the
// mnemonic and the allowed input/result vector types to
// `OuterProductWideningBase` with the number of outer products fixed at 4.
class OuterProduct4Way<
    string mnemonic,
    list<Type> allowedInputVectorTypes,
    list<Type> allowedResultVectorTypes>
  : OuterProductWideningBase<
        mnemonic,
        allowedInputVectorTypes,
        allowedResultVectorTypes,
        /*numOuterProducts=*/4>;
1111
+
1112
// Signed 4-way widening outer product with accumulate (`arm_sme.smopa_4way`).
// Inputs are 1-D scalable vectors of i8 ([16]) or i16 ([8]); the examples in
// the description show the matching i32 / i64 2-D scalable results. This op
// carries the detailed description of 4-way outer products that the other
// `*_4way` ops link to.
def SMopa4WayOp
  : OuterProduct4Way<"smopa_4way",
      [ScalableVectorOfRankAndLengthAndType<[1], [16], [I8]>,
       ScalableVectorOfRankAndLengthAndType<[1], [8], [I16]>],
      [nxnxv4i32, nxnxv2i64]> {
  let summary = "Signed integer sum of 4 outer products and accumulate";
  let description = [{
    This operation represents a sum of 4 widened outer products. It takes 2 1-D
    scalable vectors as input and a 2-D scalable vector (ZA tile) as output.

    For example (i8 to i32):

    ```mlir
    %result = arm_sme.smopa_4way $lhs, $rhs :
      vector<[16]xi8>, vector<[16]xi8> into vector<[4]x[4]xi32>
    ```

    The `lhs` encodes a matrix of shape SVLSx4 and the `rhs` a matrix of
    4xSVLS, where SVLS (spec [1], section B2.1) is the number of 32-bit
    elements in a vector of SVL bits. To illustrate, below is a breakdown of
    this operation for i8 to i32, SVL=128 (i.e., vscale=1):

    ```
                                    LHS
          [A0 A1 A2 A3 A4 A5 A6 A7 A8 A9 A10 A11 A12 A13 A14 A15]

                                    RHS
          [B0 B1 B2 B3 B4 B5 B6 B7 B8 B9 B10 B11 B12 B13 B14 B15]

    ----------------------------------------------------------------------------

                              implicit layout

                [A0   A1  A2  A3]    |    [B0 B4  B8 B12]
                [A4   A5  A6  A7]    |    [B1 B5  B9 B13]
                [A8   A9 A10 A11]    |    [B2 B6 B10 B14]
                [A12 A13 A14 A15]    |    [B3 B7 B11 B15]

    ----------------------------------------------------------------------------

                              4 outer products

              Acol0 ⊗ Brow0           |            Acol1 ⊗ Brow1
              -------------           |            -------------
                                      |
          [   B0    B4    B8    B12]  |        [   B1    B5    B9    B13]
                                      |
    [A0   [ A0B0  A0B4  A0B8  A0B12]  |  [A1   [ A1B1  A1B5  A1B9  A1B13]
     A4   [ A4B0  A4B4  A4B8  A4B12]  |   A5   [ A5B1  A5B5  A5B9  A5B13]
     A8   [ A8B0  A8B4  A8B8  A8B12]  |   A9   [ A9B1  A9B5  A9B9  A9B13]
     A12] [A12B0 A12B4 A12B8 A12B12]  |   A13] [A13B1 A13B5 A13B9 A13B13]
                                      |
              Acol2 ⊗ Brow2           |            Acol3 ⊗ Brow3
              -------------           |            -------------
                                      |
          [   B2    B6    B10    B14] |        [   B3    B7    B11    B15]
                                      |
    [A2   [ A2B2  A2B6  A2B10  A2B14] |  [A3   [ A3B3  A3B7  A3B11  A3B15]
     A6   [ A6B2  A6B6  A6B10  A6B14] |   A7   [ A7B3  A7B7  A7B11  A7B15]
     A10  [A10B2 A10B6 A10B10 A10B14] |   A11  [A11B3 A11B7 A11B11 A11B15]
     A14] [A14B2 A14B6 A14B10 A14B14] |   A15] [A15B3 A15B7 A15B11 A15B15]
                                      |

    ----------------------------------------------------------------------------

                          sum of 4 outer products

           Acol0 ⊗ Brow0 + Acol1 ⊗ Brow1 + Acol2 ⊗ Brow2 + Acol3 ⊗ Brow3

    [ A0B0 +  A1B1 +  A2B2 +  A3B3 ... ...  A0B12 +  A1B13 +  A2B14 +  A3B15]
    [ A4B0 +  A5B1 +  A6B2 +  A7B3 ... ...  A4B12 +  A5B13 +  A6B14 +  A7B15]
    [ A8B0 +  A9B1 + A10B2 + A11B3 ... ...  A8B12 +  A9B13 + A10B14 + A11B15]
    [A12B0 + A13B1 + A14B2 + A15B3 ... ... A12B12 + A13B13 + A14B14 + A15B15]

    ----------------------------------------------------------------------------
    ```

    This operation enables the folding of 4 outer products chained via the
    accumulator into a single outer product.

    For example:

    ```mlir
    %a0_ext = arith.extsi %a0 : vector<[4]xi8> to vector<[4]xi32>
    %b0_ext = arith.extsi %b0 : vector<[4]xi8> to vector<[4]xi32>

    %a1_ext = arith.extsi %a1 : vector<[4]xi8> to vector<[4]xi32>
    %b1_ext = arith.extsi %b1 : vector<[4]xi8> to vector<[4]xi32>

    %a2_ext = arith.extsi %a2 : vector<[4]xi8> to vector<[4]xi32>
    %b2_ext = arith.extsi %b2 : vector<[4]xi8> to vector<[4]xi32>

    %a3_ext = arith.extsi %a3 : vector<[4]xi8> to vector<[4]xi32>
    %b3_ext = arith.extsi %b3 : vector<[4]xi8> to vector<[4]xi32>

    %0 = arm_sme.outerproduct %a0_ext, %b0_ext : vector<[4]xi32>, vector<[4]xi32>
    %1 = arm_sme.outerproduct %a1_ext, %b1_ext acc(%0) : vector<[4]xi32>, vector<[4]xi32>
    %2 = arm_sme.outerproduct %a2_ext, %b2_ext acc(%1) : vector<[4]xi32>, vector<[4]xi32>
    %3 = arm_sme.outerproduct %a3_ext, %b3_ext acc(%2) : vector<[4]xi32>, vector<[4]xi32>
    ```

    The 4 outer products in the example above can be fused into a single outer
    product as follows:

    ```mlir
    %lhs0 = "llvm.intr.experimental.vector.interleave2"(%a0, %a2) : (vector<[4]xi8>, vector<[4]xi8>) -> vector<[8]xi8>
    %lhs1 = "llvm.intr.experimental.vector.interleave2"(%a1, %a3) : (vector<[4]xi8>, vector<[4]xi8>) -> vector<[8]xi8>
    %lhs = "llvm.intr.experimental.vector.interleave2"(%lhs0, %lhs1) : (vector<[8]xi8>, vector<[8]xi8>) -> vector<[16]xi8>

    %rhs0 = "llvm.intr.experimental.vector.interleave2"(%b0, %b2) : (vector<[4]xi8>, vector<[4]xi8>) -> vector<[8]xi8>
    %rhs1 = "llvm.intr.experimental.vector.interleave2"(%b1, %b3) : (vector<[4]xi8>, vector<[4]xi8>) -> vector<[8]xi8>
    %rhs = "llvm.intr.experimental.vector.interleave2"(%rhs0, %rhs1) : (vector<[8]xi8>, vector<[8]xi8>) -> vector<[16]xi8>

    %0 = arm_sme.smopa_4way %lhs, %rhs : vector<[16]xi8>, vector<[16]xi8> into vector<[4]x[4]xi32>
    ```

    This is implemented in the `-arm-sme-outer-product-fusion` pass.

    Example: I8 to I32
    ```mlir
    %result = arm_sme.smopa_4way $lhs, $rhs : vector<[16]xi8>, vector<[16]xi8> into vector<[4]x[4]xi32>
    ```

    Example: I16 to I64
    ```mlir
    %result = arm_sme.smopa_4way $lhs, $rhs : vector<[8]xi16>, vector<[8]xi16> into vector<[2]x[2]xi64>
    ```

    | Spec | Features |
    | ---- | -------- |
    | [SMOPA (4-way)](https://developer.arm.com/documentation/ddi0602/2023-09/SME-Instructions/SMOPA--4-way---Signed-integer-sum-of-outer-products-and-accumulate-) | +sme (32-bit), +sme-i16i64 (64-bit)|
  }];
}
1246
+
1247
// Signed 4-way widening outer product with subtract (`arm_sme.smops_4way`).
// Accepts the same i8 ([16]) / i16 ([8]) 1-D scalable input vectors as
// `smopa_4way`; see that op's description for the full semantics.
def SMops4WayOp
  : OuterProduct4Way<"smops_4way",
      [ScalableVectorOfRankAndLengthAndType<[1], [16], [I8]>,
       ScalableVectorOfRankAndLengthAndType<[1], [8], [I16]>],
      [nxnxv4i32, nxnxv2i64]> {
  let summary = "Signed integer sum of 4 outer products and subtract";
  let description = [{
    Equivalent to `smopa_4way` but outer products are subtracted from
    destination `result`.

    Example: I8 to I32
    ```mlir
    %result = arm_sme.smops_4way $lhs, $rhs : vector<[16]xi8>, vector<[16]xi8> into vector<[4]x[4]xi32>
    ```

    Example: I16 to I64
    ```mlir
    %result = arm_sme.smops_4way $lhs, $rhs : vector<[8]xi16>, vector<[8]xi16> into vector<[2]x[2]xi64>
    ```

    Refer to [smopa_4way](#arm_smesmopa_4way-arm_smesmopa_4wayop) for a
    detailed description of 4-way outer products.

    | Spec | Features |
    | ---- | -------- |
    | [SMOPS (4-way)](https://developer.arm.com/documentation/ddi0602/2023-09/SME-Instructions/SMOPS--4-way---Signed-integer-sum-of-outer-products-and-subtract-) | +sme (32-bit), +sme-i16i64 (64-bit)|
  }];
}
1276
+
1277
// Unsigned 4-way widening outer product with accumulate
// (`arm_sme.umopa_4way`). Accepts i8 ([16]) or i16 ([8]) 1-D scalable input
// vectors; the description links to `smopa_4way` for the full semantics.
def UMopa4WayOp
  : OuterProduct4Way<"umopa_4way",
      [ScalableVectorOfRankAndLengthAndType<[1], [16], [I8]>,
       ScalableVectorOfRankAndLengthAndType<[1], [8], [I16]>],
      [nxnxv4i32, nxnxv2i64]> {
  let summary = "Unsigned integer sum of 4 outer products and accumulate";
  let description = [{
    Example: I8 to I32
    ```mlir
    %result = arm_sme.umopa_4way $lhs, $rhs : vector<[16]xi8>, vector<[16]xi8> into vector<[4]x[4]xi32>
    ```

    Example: I16 to I64
    ```mlir
    %result = arm_sme.umopa_4way $lhs, $rhs : vector<[8]xi16>, vector<[8]xi16> into vector<[2]x[2]xi64>
    ```

    Refer to [smopa_4way](#arm_smesmopa_4way-arm_smesmopa_4wayop) for a
    detailed description of 4-way outer products.

    | Spec | Features |
    | ---- | -------- |
    | [UMOPA (4-way)](https://developer.arm.com/documentation/ddi0602/2023-09/SME-Instructions/UMOPA--4-way---Unsigned-integer-sum-of-outer-products-and-accumulate-) | +sme (32-bit), +sme-i16i64 (64-bit)|
  }];
}
1303
+
1304
// Unsigned 4-way widening outer product with subtract
// (`arm_sme.umops_4way`). Accepts i8 ([16]) or i16 ([8]) 1-D scalable input
// vectors; the description links to `smopa_4way` for the full semantics.
def UMops4WayOp
  : OuterProduct4Way<"umops_4way",
      [ScalableVectorOfRankAndLengthAndType<[1], [16], [I8]>,
       ScalableVectorOfRankAndLengthAndType<[1], [8], [I16]>],
      [nxnxv4i32, nxnxv2i64]> {
  let summary = "Unsigned integer sum of 4 outer products and subtract";
  let description = [{
    Example: I8 to I32
    ```mlir
    %result = arm_sme.umops_4way $lhs, $rhs : vector<[16]xi8>, vector<[16]xi8> into vector<[4]x[4]xi32>
    ```

    Example: I16 to I64
    ```mlir
    %result = arm_sme.umops_4way $lhs, $rhs : vector<[8]xi16>, vector<[8]xi16> into vector<[2]x[2]xi64>
    ```

    Refer to [smopa_4way](#arm_smesmopa_4way-arm_smesmopa_4wayop) for a
    detailed description of 4-way outer products.

    | Spec | Features |
    | ---- | -------- |
    | [UMOPS (4-way)](https://developer.arm.com/documentation/ddi0602/2023-09/SME-Instructions/UMOPS--4-way---Unsigned-integer-sum-of-outer-products-and-subtract-) | +sme (32-bit), +sme-i16i64 (64-bit)|
  }];
}
1330
+
1331
// Signed-by-unsigned 4-way widening outer product with accumulate
// (`arm_sme.sumopa_4way`). Accepts i8 ([16]) or i16 ([8]) 1-D scalable input
// vectors; the description links to `smopa_4way` for the full semantics.
def SuMopa4WayOp
  : OuterProduct4Way<"sumopa_4way",
      [ScalableVectorOfRankAndLengthAndType<[1], [16], [I8]>,
       ScalableVectorOfRankAndLengthAndType<[1], [8], [I16]>],
      [nxnxv4i32, nxnxv2i64]> {
  let summary = "Signed by unsigned integer sum of 4 outer products and accumulate";
  let description = [{
    Example: I8 to I32
    ```mlir
    %result = arm_sme.sumopa_4way $lhs, $rhs : vector<[16]xi8>, vector<[16]xi8> into vector<[4]x[4]xi32>
    ```

    Example: I16 to I64
    ```mlir
    %result = arm_sme.sumopa_4way $lhs, $rhs : vector<[8]xi16>, vector<[8]xi16> into vector<[2]x[2]xi64>
    ```

    Refer to [smopa_4way](#arm_smesmopa_4way-arm_smesmopa_4wayop) for a
    detailed description of 4-way outer products.

    | Spec | Features |
    | ---- | -------- |
    | [SUMOPA (4-way)](https://developer.arm.com/documentation/ddi0602/2023-09/SME-Instructions/SUMOPA--Signed-by-unsigned-integer-sum-of-outer-products-and-accumulate-) | +sme (32-bit), +sme-i16i64 (64-bit)|
  }];
}
1357
+
1358
// Signed-by-unsigned 4-way widening outer product with subtract
// (`arm_sme.sumops_4way`). Accepts i8 ([16]) or i16 ([8]) 1-D scalable input
// vectors; the description links to `smopa_4way` for the full semantics.
def SuMops4WayOp
  : OuterProduct4Way<"sumops_4way",
      [ScalableVectorOfRankAndLengthAndType<[1], [16], [I8]>,
       ScalableVectorOfRankAndLengthAndType<[1], [8], [I16]>],
      [nxnxv4i32, nxnxv2i64]> {
  let summary = "Signed by unsigned integer sum of 4 outer products and subtract";
  let description = [{
    Example: I8 to I32
    ```mlir
    %result = arm_sme.sumops_4way $lhs, $rhs : vector<[16]xi8>, vector<[16]xi8> into vector<[4]x[4]xi32>
    ```

    Example: I16 to I64
    ```mlir
    %result = arm_sme.sumops_4way $lhs, $rhs : vector<[8]xi16>, vector<[8]xi16> into vector<[2]x[2]xi64>
    ```

    Refer to [smopa_4way](#arm_smesmopa_4way-arm_smesmopa_4wayop) for a
    detailed description of 4-way outer products.

    | Spec | Features |
    | ---- | -------- |
    | [SUMOPS (4-way)](https://developer.arm.com/documentation/ddi0602/2023-09/SME-Instructions/SUMOPS--Signed-by-unsigned-integer-sum-of-outer-products-and-subtract-) | +sme (32-bit), +sme-i16i64 (64-bit)|
  }];
}
1384
+
1385
// Unsigned-by-signed 4-way widening outer product with accumulate
// (`arm_sme.usmopa_4way`). Accepts i8 ([16]) or i16 ([8]) 1-D scalable input
// vectors; the description links to `smopa_4way` for the full semantics.
def UsMopa4WayOp
  : OuterProduct4Way<"usmopa_4way",
      [ScalableVectorOfRankAndLengthAndType<[1], [16], [I8]>,
       ScalableVectorOfRankAndLengthAndType<[1], [8], [I16]>],
      [nxnxv4i32, nxnxv2i64]> {
  let summary = "Unsigned by signed integer sum of 4 outer products and accumulate";
  let description = [{
    Example: I8 to I32
    ```mlir
    %result = arm_sme.usmopa_4way $lhs, $rhs : vector<[16]xi8>, vector<[16]xi8> into vector<[4]x[4]xi32>
    ```

    Example: I16 to I64
    ```mlir
    %result = arm_sme.usmopa_4way $lhs, $rhs : vector<[8]xi16>, vector<[8]xi16> into vector<[2]x[2]xi64>
    ```

    Refer to [smopa_4way](#arm_smesmopa_4way-arm_smesmopa_4wayop) for a
    detailed description of 4-way outer products.

    | Spec | Features |
    | ---- | -------- |
    | [USMOPA (4-way)](https://developer.arm.com/documentation/ddi0602/2023-09/SME-Instructions/USMOPA--Unsigned-by-signed-integer-sum-of-outer-products-and-accumulate-) | +sme (32-bit), +sme-i16i64 (64-bit)|
  }];
}
1411
+
1412
// Unsigned-by-signed 4-way widening outer product with subtract
// (`arm_sme.usmops_4way`). Accepts i8 ([16]) or i16 ([8]) 1-D scalable input
// vectors; the description links to `smopa_4way` for the full semantics.
def UsMops4WayOp
  : OuterProduct4Way<"usmops_4way",
      [ScalableVectorOfRankAndLengthAndType<[1], [16], [I8]>,
       ScalableVectorOfRankAndLengthAndType<[1], [8], [I16]>],
      [nxnxv4i32, nxnxv2i64]> {
  let summary = "Unsigned by signed integer sum of 4 outer products and subtract";
  let description = [{
    Example: I8 to I32
    ```mlir
    %result = arm_sme.usmops_4way $lhs, $rhs : vector<[16]xi8>, vector<[16]xi8> into vector<[4]x[4]xi32>
    ```

    Example: I16 to I64
    ```mlir
    %result = arm_sme.usmops_4way $lhs, $rhs : vector<[8]xi16>, vector<[8]xi16> into vector<[2]x[2]xi64>
    ```

    Refer to [smopa_4way](#arm_smesmopa_4way-arm_smesmopa_4wayop) for a
    detailed description of 4-way outer products.

    | Spec | Features |
    | ---- | -------- |
    | [USMOPS (4-way)](https://developer.arm.com/documentation/ddi0602/2023-09/SME-Instructions/USMOPS--Unsigned-by-signed-integer-sum-of-outer-products-and-subtract-) | +sme (32-bit), +sme-i16i64 (64-bit)|
  }];
}
1438
+
1106
1439
def StreamingVLOp : ArmSME_Op<"streaming_vl", [Pure]>
1107
1440
{
1108
1441
let summary = "Query the streaming vector length";
0 commit comments