@@ -1341,3 +1341,72 @@ entry:
   %avg = ashr <vscale x 2 x i64> %add, splat (i64 1)
   ret <vscale x 2 x i64> %avg
 }
+
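+; Unsigned floor average of two zero-extended masked loads:
+; (a & b) + ((a ^ b) >> 1) == (a + b) >> 1 with no risk of overflow.
+; SVE2 can select uhadd directly; plain SVE expands the average to add + lsr.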
+define void @zext_mload_avgflooru(ptr %p1, ptr %p2, <vscale x 8 x i1> %mask) {
+; SVE-LABEL: zext_mload_avgflooru:
+; SVE:       // %bb.0:
+; SVE-NEXT:    ld1b { z0.h }, p0/z, [x0]
+; SVE-NEXT:    ld1b { z1.h }, p0/z, [x1]
+; SVE-NEXT:    and z0.h, z0.h, #0xff
+; SVE-NEXT:    and z1.h, z1.h, #0xff
+; SVE-NEXT:    add z0.h, z0.h, z1.h
+; SVE-NEXT:    lsr z0.h, z0.h, #1
+; SVE-NEXT:    st1h { z0.h }, p0, [x0]
+; SVE-NEXT:    ret
+;
+; SVE2-LABEL: zext_mload_avgflooru:
+; SVE2:       // %bb.0:
+; SVE2-NEXT:    ld1b { z0.h }, p0/z, [x0]
+; SVE2-NEXT:    ld1b { z1.h }, p0/z, [x1]
+; SVE2-NEXT:    ptrue p1.h
+; SVE2-NEXT:    and z0.h, z0.h, #0xff
+; SVE2-NEXT:    and z1.h, z1.h, #0xff
+; SVE2-NEXT:    uhadd z0.h, p1/m, z0.h, z1.h
+; SVE2-NEXT:    st1h { z0.h }, p0, [x0]
+; SVE2-NEXT:    ret
+  %ld1 = call <vscale x 8 x i8> @llvm.masked.load(ptr %p1, i32 16, <vscale x 8 x i1> %mask, <vscale x 8 x i8> zeroinitializer)
+  %ld2 = call <vscale x 8 x i8> @llvm.masked.load(ptr %p2, i32 16, <vscale x 8 x i1> %mask, <vscale x 8 x i8> zeroinitializer)
+  %and = and <vscale x 8 x i8> %ld1, %ld2
+  %xor = xor <vscale x 8 x i8> %ld1, %ld2
+  %shift = lshr <vscale x 8 x i8> %xor, splat (i8 1)
+  %avg = add <vscale x 8 x i8> %and, %shift
+  %avgext = zext <vscale x 8 x i8> %avg to <vscale x 8 x i16>
+  call void @llvm.masked.store.nxv8i16(<vscale x 8 x i16> %avgext, ptr %p1, i32 16, <vscale x 8 x i1> %mask)
+  ret void
+}
+
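+; Unsigned ceiling average, (a + b + 1) >> 1, computed in the wider type so
+; the +1 cannot overflow. SVE2 can select urhadd directly; plain SVE expands
+; it via eor/sub + lsr.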
+define void @zext_mload_avgceilu(ptr %p1, ptr %p2, <vscale x 8 x i1> %mask) {
+; SVE-LABEL: zext_mload_avgceilu:
+; SVE:       // %bb.0:
+; SVE-NEXT:    ld1b { z0.h }, p0/z, [x0]
+; SVE-NEXT:    ld1b { z1.h }, p0/z, [x1]
+; SVE-NEXT:    mov z2.h, #-1 // =0xffffffffffffffff
+; SVE-NEXT:    and z0.h, z0.h, #0xff
+; SVE-NEXT:    and z1.h, z1.h, #0xff
+; SVE-NEXT:    eor z0.d, z0.d, z2.d
+; SVE-NEXT:    sub z0.h, z1.h, z0.h
+; SVE-NEXT:    lsr z0.h, z0.h, #1
+; SVE-NEXT:    st1b { z0.h }, p0, [x0]
+; SVE-NEXT:    ret
+;
+; SVE2-LABEL: zext_mload_avgceilu:
+; SVE2:       // %bb.0:
+; SVE2-NEXT:    ld1b { z0.h }, p0/z, [x0]
+; SVE2-NEXT:    ld1b { z1.h }, p0/z, [x1]
+; SVE2-NEXT:    ptrue p1.h
+; SVE2-NEXT:    and z0.h, z0.h, #0xff
+; SVE2-NEXT:    and z1.h, z1.h, #0xff
+; SVE2-NEXT:    urhadd z0.h, p1/m, z0.h, z1.h
+; SVE2-NEXT:    st1b { z0.h }, p0, [x0]
+; SVE2-NEXT:    ret
+  %ld1 = call <vscale x 8 x i8> @llvm.masked.load(ptr %p1, i32 16, <vscale x 8 x i1> %mask, <vscale x 8 x i8> zeroinitializer)
+  %ld2 = call <vscale x 8 x i8> @llvm.masked.load(ptr %p2, i32 16, <vscale x 8 x i1> %mask, <vscale x 8 x i8> zeroinitializer)
+  %zext1 = zext <vscale x 8 x i8> %ld1 to <vscale x 8 x i16>
+  %zext2 = zext <vscale x 8 x i8> %ld2 to <vscale x 8 x i16>
+  %add1 = add nuw nsw <vscale x 8 x i16> %zext1, splat (i16 1)
+  %add2 = add nuw nsw <vscale x 8 x i16> %add1, %zext2
+  %shift = lshr <vscale x 8 x i16> %add2, splat (i16 1)
+  %trunc = trunc <vscale x 8 x i16> %shift to <vscale x 8 x i8>
+  call void @llvm.masked.store.nxv8i8(<vscale x 8 x i8> %trunc, ptr %p1, i32 16, <vscale x 8 x i1> %mask)
+  ret void
+}