@@ -887,3 +887,77 @@ entry:
887
887
%partial.reduce = tail call <2 x i64 > @llvm.experimental.vector.partial.reduce.add.v2i64.v8i64 (<2 x i64 > %acc , <8 x i64 > %mult )
888
888
ret <2 x i64 > %partial.reduce
889
889
}
890
+
891
; Test: two partial-reduce multiply-accumulate chains that share one
; zero-extended operand.
;
; Each loop iteration loads 16 bytes from %p1, %p2 and %p3 at the same byte
; index. The %p1 and %p2 data are sign-extended to i32; the %p3 data is
; zero-extended once, and that single zext value feeds both multiplies. Each
; product is folded into its own <4 x i32> accumulator with the partial-reduce
; intrinsic. The index advances by 16 per iteration, the loop exits when it
; reaches 1024, and the function returns the sum of the two accumulators.
;
; The CHECK lines below expect the %p3 data (loaded via x2) to go through
; tbl/uzp1, the %p1/%p2 data (loaded via x0/x1) through sshll/sshll2, and the
; products to be accumulated with smull/smlal/smlal2.
; NOTE(review): the CHECK lines look autogenerated; presumably they should be
; regenerated with the test-update script rather than edited by hand - confirm.
+ define <4 x i32 > @usdot_multiple_zext_users (ptr %p1 , ptr %p2 , ptr %p3 ) {
892
+ ; CHECK-LABEL: usdot_multiple_zext_users:
893
+ ; CHECK: // %bb.0: // %entry
894
+ ; CHECK-NEXT: adrp x8, .LCPI28_0
895
+ ; CHECK-NEXT: movi v0.2d, #0000000000000000
896
+ ; CHECK-NEXT: movi v2.2d, #0000000000000000
897
+ ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI28_0]
898
+ ; CHECK-NEXT: adrp x8, .LCPI28_1
899
+ ; CHECK-NEXT: adrp x9, .LCPI28_2
900
+ ; CHECK-NEXT: adrp x10, .LCPI28_3
901
+ ; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI28_1]
902
+ ; CHECK-NEXT: ldr q4, [x9, :lo12:.LCPI28_2]
903
+ ; CHECK-NEXT: ldr q5, [x10, :lo12:.LCPI28_3]
904
+ ; CHECK-NEXT: mov x8, xzr
905
+ ; CHECK-NEXT: .LBB28_1: // %vector.body
906
+ ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
907
+ ; CHECK-NEXT: ldr q6, [x2, x8]
908
+ ; CHECK-NEXT: ldr q18, [x0, x8]
909
+ ; CHECK-NEXT: ldr q19, [x1, x8]
910
+ ; CHECK-NEXT: add x8, x8, #16
911
+ ; CHECK-NEXT: tbl v7.16b, { v6.16b }, v1.16b
912
+ ; CHECK-NEXT: tbl v16.16b, { v6.16b }, v3.16b
913
+ ; CHECK-NEXT: tbl v17.16b, { v6.16b }, v4.16b
914
+ ; CHECK-NEXT: tbl v6.16b, { v6.16b }, v5.16b
915
+ ; CHECK-NEXT: cmp x8, #1024
916
+ ; CHECK-NEXT: uzp1 v7.8h, v16.8h, v7.8h
917
+ ; CHECK-NEXT: sshll v16.8h, v18.8b, #0
918
+ ; CHECK-NEXT: uzp1 v6.8h, v6.8h, v17.8h
919
+ ; CHECK-NEXT: sshll2 v17.8h, v18.16b, #0
920
+ ; CHECK-NEXT: sshll v18.8h, v19.8b, #0
921
+ ; CHECK-NEXT: sshll2 v19.8h, v19.16b, #0
922
+ ; CHECK-NEXT: smlal v0.4s, v16.4h, v7.4h
923
+ ; CHECK-NEXT: smlal v2.4s, v18.4h, v7.4h
924
+ ; CHECK-NEXT: smull v20.4s, v17.4h, v6.4h
925
+ ; CHECK-NEXT: smull v21.4s, v19.4h, v6.4h
926
+ ; CHECK-NEXT: smlal2 v0.4s, v17.8h, v6.8h
927
+ ; CHECK-NEXT: smlal2 v2.4s, v19.8h, v6.8h
928
+ ; CHECK-NEXT: smlal2 v20.4s, v16.8h, v7.8h
929
+ ; CHECK-NEXT: smlal2 v21.4s, v18.8h, v7.8h
930
+ ; CHECK-NEXT: add v0.4s, v20.4s, v0.4s
931
+ ; CHECK-NEXT: add v2.4s, v21.4s, v2.4s
932
+ ; CHECK-NEXT: b.ne .LBB28_1
933
+ ; CHECK-NEXT: // %bb.2: // %end
934
+ ; CHECK-NEXT: add v0.4s, v2.4s, v0.4s
935
+ ; CHECK-NEXT: ret
936
+ entry:
937
+ br label %vector.body
938
+
939
+ vector.body:
940
; Byte index shared by all three buffers; starts at 0 and advances by 16.
+ %iv = phi i64 [ 0 , %entry ], [ %iv.next , %vector.body ]
941
; One <4 x i32> accumulator per multiply chain, both starting at zero.
+ %acc1 = phi <4 x i32 > [ zeroinitializer , %entry ], [ %psum1 , %vector.body ]
942
+ %acc2 = phi <4 x i32 > [ zeroinitializer , %entry ], [ %psum2 , %vector.body ]
943
; Byte-addressed pointers into each buffer for this iteration.
+ %ptr1 = getelementptr i8 , ptr %p1 , i64 %iv
944
+ %ptr2 = getelementptr i8 , ptr %p2 , i64 %iv
945
+ %ptr3 = getelementptr i8 , ptr %p3 , i64 %iv
946
+ %load1 = load <16 x i8 >, ptr %ptr1
947
+ %load2 = load <16 x i8 >, ptr %ptr2
948
+ %load3 = load <16 x i8 >, ptr %ptr3
949
; First chain: %load1 is sign-extended; %load3 is zero-extended once and the
; resulting %zext is reused by the second chain below.
+ %sext1 = sext <16 x i8 > %load1 to <16 x i32 >
950
+ %zext = zext <16 x i8 > %load3 to <16 x i32 >
951
+ %mul1 = mul <16 x i32 > %sext1 , %zext
952
+ %psum1 = tail call <4 x i32 > @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32 (<4 x i32 > %acc1 , <16 x i32 > %mul1 )
953
; Second chain: %load2 is sign-extended and multiplied by the same %zext.
+ %sext2 = sext <16 x i8 > %load2 to <16 x i32 >
954
+ %mul2 = mul <16 x i32 > %sext2 , %zext
955
+ %psum2 = tail call <4 x i32 > @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32 (<4 x i32 > %acc2 , <16 x i32 > %mul2 )
956
+ %iv.next = add i64 %iv , 16
957
; Leave the loop once the advanced index equals 1024.
+ %1 = icmp eq i64 %iv.next , 1024
958
+ br i1 %1 , label %end , label %vector.body
959
+
960
+ end:
961
; Combine the two final partial sums into the returned vector.
+ %2 = add <4 x i32 > %psum2 , %psum1
962
+ ret <4 x i32 > %2
963
+ }
0 commit comments