@@ -1341,3 +1341,72 @@ entry:
  %avg = ashr <vscale x 2 x i64> %add, splat (i64 1)
  ret <vscale x 2 x i64> %avg
}
+
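+ ; Unsigned floor average: (a & b) + ((a ^ b) >> 1). Plain SVE has to expand
+ ; this as eor/and/lsr/add, while SVE2 can select a single predicated UHADD,
+ ; folding the zero-extending masked loads into ld1b.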
+ define void @zext_mload_avgflooru(ptr %p1, ptr %p2, <vscale x 8 x i1> %mask) {
+ ; SVE-LABEL: zext_mload_avgflooru:
+ ; SVE:       // %bb.0:
+ ; SVE-NEXT:    ld1b { z0.h }, p0/z, [x0]
+ ; SVE-NEXT:    ld1b { z1.h }, p0/z, [x1]
+ ; SVE-NEXT:    eor z2.d, z0.d, z1.d
+ ; SVE-NEXT:    and z0.d, z0.d, z1.d
+ ; SVE-NEXT:    lsr z1.h, z2.h, #1
+ ; SVE-NEXT:    add z0.h, z0.h, z1.h
+ ; SVE-NEXT:    st1h { z0.h }, p0, [x0]
+ ; SVE-NEXT:    ret
+ ;
+ ; SVE2-LABEL: zext_mload_avgflooru:
+ ; SVE2:       // %bb.0:
+ ; SVE2-NEXT:    ld1b { z0.h }, p0/z, [x0]
+ ; SVE2-NEXT:    ld1b { z1.h }, p0/z, [x1]
+ ; SVE2-NEXT:    ptrue p1.h
+ ; SVE2-NEXT:    uhadd z0.h, p1/m, z0.h, z1.h
+ ; SVE2-NEXT:    st1h { z0.h }, p0, [x0]
+ ; SVE2-NEXT:    ret
+   %ld1 = call <vscale x 8 x i8> @llvm.masked.load(ptr %p1, i32 16, <vscale x 8 x i1> %mask, <vscale x 8 x i8> zeroinitializer)
+   %ld2 = call <vscale x 8 x i8> @llvm.masked.load(ptr %p2, i32 16, <vscale x 8 x i1> %mask, <vscale x 8 x i8> zeroinitializer)
+   %and = and <vscale x 8 x i8> %ld1, %ld2
+   %xor = xor <vscale x 8 x i8> %ld1, %ld2
+   %shift = lshr <vscale x 8 x i8> %xor, splat(i8 1)
+   %avg = add <vscale x 8 x i8> %and, %shift
+   %avgext = zext <vscale x 8 x i8> %avg to <vscale x 8 x i16>
+   call void @llvm.masked.store.nxv8i16(<vscale x 8 x i16> %avgext, ptr %p1, i32 16, <vscale x 8 x i1> %mask)
+   ret void
+ }
+
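+ ; Unsigned ceiling average: (a + b + 1) >> 1, computed in i16 so the add
+ ; cannot overflow. Plain SVE expands this as (a | b) - ((a ^ b) >> 1);
+ ; SVE2 can select a single predicated URHADD.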
+ define void @zext_mload_avgceilu(ptr %p1, ptr %p2, <vscale x 8 x i1> %mask) {
+ ; SVE-LABEL: zext_mload_avgceilu:
+ ; SVE:       // %bb.0:
+ ; SVE-NEXT:    ld1b { z0.h }, p0/z, [x0]
+ ; SVE-NEXT:    ld1b { z1.h }, p0/z, [x1]
+ ; SVE-NEXT:    eor z2.d, z0.d, z1.d
+ ; SVE-NEXT:    orr z0.d, z0.d, z1.d
+ ; SVE-NEXT:    lsr z1.h, z2.h, #1
+ ; SVE-NEXT:    sub z0.h, z0.h, z1.h
+ ; SVE-NEXT:    st1b { z0.h }, p0, [x0]
+ ; SVE-NEXT:    ret
+ ;
+ ; SVE2-LABEL: zext_mload_avgceilu:
+ ; SVE2:       // %bb.0:
+ ; SVE2-NEXT:    ld1b { z0.h }, p0/z, [x0]
+ ; SVE2-NEXT:    ld1b { z1.h }, p0/z, [x1]
+ ; SVE2-NEXT:    ptrue p1.h
+ ; SVE2-NEXT:    urhadd z0.h, p1/m, z0.h, z1.h
+ ; SVE2-NEXT:    st1b { z0.h }, p0, [x0]
+ ; SVE2-NEXT:    ret
+   %ld1 = call <vscale x 8 x i8> @llvm.masked.load(ptr %p1, i32 16, <vscale x 8 x i1> %mask, <vscale x 8 x i8> zeroinitializer)
+   %ld2 = call <vscale x 8 x i8> @llvm.masked.load(ptr %p2, i32 16, <vscale x 8 x i1> %mask, <vscale x 8 x i8> zeroinitializer)
+   %zext1 = zext <vscale x 8 x i8> %ld1 to <vscale x 8 x i16>
+   %zext2 = zext <vscale x 8 x i8> %ld2 to <vscale x 8 x i16>
+   %add1 = add nuw nsw <vscale x 8 x i16> %zext1, splat(i16 1)
+   %add2 = add nuw nsw <vscale x 8 x i16> %add1, %zext2
+   %shift = lshr <vscale x 8 x i16> %add2, splat(i16 1)
+   %trunc = trunc <vscale x 8 x i16> %shift to <vscale x 8 x i8>
+   call void @llvm.masked.store.nxv8i8(<vscale x 8 x i8> %trunc, ptr %p1, i32 16, <vscale x 8 x i1> %mask)
+   ret void
+ }