@@ -592,7 +592,41 @@ define dso_local i32 @predicated_test(i32 noundef %0, ptr %glob) #0 {
592
592
ret i32 0
593
593
}
594
594
595
+ ; This loop runs at most 3 times (its count is %blockSize & 3, and it is skipped when that is 0). The codegen is currently much better with <8 x half> vectorization.
596
+ ; CHECK-LABEL: arm_q15_to_f16_remainder
597
+ ; CHECK: LV: Selecting VF: 8
598
+ define void @arm_q15_to_f16_remainder (ptr nocapture noundef readonly %pSrc , ptr nocapture noundef writeonly noalias %pDst , i32 noundef %blockSize ) #0 { ; Scalar loop: reads i16 values from %pSrc, converts each to half scaled by 2^-15, writes to %pDst
599
+ entry: ; Computes the iteration count and skips the loop entirely when it is zero
600
+ %rem = and i32 %blockSize , 3 ; keep only the low two bits of %blockSize, so %rem is in 0..3
601
+ %cmp.not5 = icmp eq i32 %rem , 0 ; true when there is nothing to process
602
+ br i1 %cmp.not5 , label %while.end , label %while.body.preheader ; zero count -> return without touching memory
603
+
604
+ while.body.preheader: ; preds = %entry ; reached only when %rem is 1..3
605
+ br label %while.body
606
+
607
+ while.body: ; preds = %while.body.preheader, %while.body ; one element per iteration
608
+ %blkCnt.08 = phi i32 [ %dec , %while.body ], [ %rem , %while.body.preheader ] ; remaining iterations, counts down from %rem
609
+ %pIn.07 = phi ptr [ %incdec.ptr , %while.body ], [ %pSrc , %while.body.preheader ] ; current source element
610
+ %pDst.addr.06 = phi ptr [ %incdec.ptr2 , %while.body ], [ %pDst , %while.body.preheader ] ; current destination element
611
+ %incdec.ptr = getelementptr inbounds i8 , ptr %pIn.07 , i32 2 ; next source address: +2 bytes, i.e. one i16
612
+ %0 = load i16 , ptr %pIn.07 , align 2 ; read the current 16-bit input
613
+ %conv1 = sitofp i16 %0 to half ; signed integer -> half conversion
614
+ %1 = fmul fast half %conv1 , 0xH0200 ; 0xH0200 is the half bit pattern for 2^-15 (a subnormal), so this divides by 32768
615
+ %incdec.ptr2 = getelementptr inbounds i8 , ptr %pDst.addr.06 , i32 2 ; next destination address: +2 bytes, i.e. one half
616
+ store half %1 , ptr %pDst.addr.06 , align 2 ; write the scaled result
617
+ %dec = add nsw i32 %blkCnt.08 , -1 ; one fewer iteration remaining
618
+ %cmp.not = icmp eq i32 %dec , 0 ; true after the last element has been stored
619
+ br i1 %cmp.not , label %while.end.loopexit , label %while.body ; loop until the counter reaches zero
620
+
621
+ while.end.loopexit: ; preds = %while.body ; dedicated exit block for the loop
622
+ br label %while.end
623
+
624
+ while.end: ; preds = %while.end.loopexit, %entry ; common return point for both the skipped and the executed loop
625
+ ret void
626
+ }
627
+
628
+
595
629
declare void @llvm.lifetime.start.p0 (i64 , ptr )
596
630
declare void @llvm.lifetime.end.p0 (i64 , ptr )
597
631
598
- attributes #0 = { "target-features" ="+mve" }
632
+ attributes #0 = { "target-features" ="+mve.fp" } ; attribute group #0, referenced by the function definitions in this file; "mve.fp" presumably selects MVE with floating-point support, which the half-precision test relies on - the feature name must not carry trailing whitespace
0 commit comments