Skip to content

Commit 0c4651f

Browse files
committed
[CostModel][AArch64] Improve cost model for vector reduction intrinsics
OR, XOR and AND entries are added to the cost table. An extra cost is added when vector splitting occurs. This is done to address the issue of a missed SLP vectorization opportunity due to unreasonably high costs being attributed to the vector OR reduction (see: https://bugs.llvm.org/show_bug.cgi?id=44593). Differential Revision: https://reviews.llvm.org/D104538
1 parent e3ea2d7 commit 0c4651f

File tree

7 files changed

+259
-108
lines changed

7 files changed

+259
-108
lines changed

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

Lines changed: 60 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -1791,17 +1791,68 @@ AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
17911791
// Horizontal adds can use the 'addv' instruction. We model the cost of these
17921792
// instructions as normal vector adds. This is the only arithmetic vector
17931793
// reduction operation for which we have an instruction.
1794+
// OR, XOR and AND costs should match the codegen from:
1795+
// OR: llvm/test/CodeGen/AArch64/reduce-or.ll
1796+
// XOR: llvm/test/CodeGen/AArch64/reduce-xor.ll
1797+
// AND: llvm/test/CodeGen/AArch64/reduce-and.ll
17941798
static const CostTblEntry CostTblNoPairwise[]{
1795-
{ISD::ADD, MVT::v8i8, 1},
1796-
{ISD::ADD, MVT::v16i8, 1},
1797-
{ISD::ADD, MVT::v4i16, 1},
1798-
{ISD::ADD, MVT::v8i16, 1},
1799-
{ISD::ADD, MVT::v4i32, 1},
1799+
{ISD::ADD, MVT::v8i8, 1},
1800+
{ISD::ADD, MVT::v16i8, 1},
1801+
{ISD::ADD, MVT::v4i16, 1},
1802+
{ISD::ADD, MVT::v8i16, 1},
1803+
{ISD::ADD, MVT::v4i32, 1},
1804+
{ISD::OR, MVT::v8i8, 15},
1805+
{ISD::OR, MVT::v16i8, 17},
1806+
{ISD::OR, MVT::v4i16, 7},
1807+
{ISD::OR, MVT::v8i16, 9},
1808+
{ISD::OR, MVT::v2i32, 3},
1809+
{ISD::OR, MVT::v4i32, 5},
1810+
{ISD::OR, MVT::v2i64, 3},
1811+
{ISD::XOR, MVT::v8i8, 15},
1812+
{ISD::XOR, MVT::v16i8, 17},
1813+
{ISD::XOR, MVT::v4i16, 7},
1814+
{ISD::XOR, MVT::v8i16, 9},
1815+
{ISD::XOR, MVT::v2i32, 3},
1816+
{ISD::XOR, MVT::v4i32, 5},
1817+
{ISD::XOR, MVT::v2i64, 3},
1818+
{ISD::AND, MVT::v8i8, 15},
1819+
{ISD::AND, MVT::v16i8, 17},
1820+
{ISD::AND, MVT::v4i16, 7},
1821+
{ISD::AND, MVT::v8i16, 9},
1822+
{ISD::AND, MVT::v2i32, 3},
1823+
{ISD::AND, MVT::v4i32, 5},
1824+
{ISD::AND, MVT::v2i64, 3},
18001825
};
1801-
1802-
if (const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy))
1803-
return LT.first * Entry->Cost;
1804-
1826+
switch (ISD) {
1827+
default:
1828+
break;
1829+
case ISD::ADD:
1830+
if (const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy))
1831+
return LT.first * Entry->Cost;
1832+
break;
1833+
case ISD::XOR:
1834+
case ISD::AND:
1835+
case ISD::OR:
1836+
const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy);
1837+
if (!Entry)
1838+
break;
1839+
auto *ValVTy = cast<FixedVectorType>(ValTy);
1840+
if (!ValVTy->getElementType()->isIntegerTy(1) &&
1841+
MTy.getVectorNumElements() <= ValVTy->getNumElements() &&
1842+
isPowerOf2_32(ValVTy->getNumElements())) {
1843+
InstructionCost ExtraCost = 0;
1844+
if (LT.first != 1) {
1845+
// Type needs to be split, so there is an extra cost of LT.first - 1
1846+
// arithmetic ops.
1847+
auto *Ty = FixedVectorType::get(ValTy->getElementType(),
1848+
MTy.getVectorNumElements());
1849+
ExtraCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
1850+
ExtraCost *= LT.first - 1;
1851+
}
1852+
return Entry->Cost + ExtraCost;
1853+
}
1854+
break;
1855+
}
18051856
return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwiseForm,
18061857
CostKind);
18071858
}

llvm/test/Analysis/CostModel/AArch64/reduce-and.ll

Lines changed: 47 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -11,8 +11,24 @@ define i32 @reduce_i1(i32 %arg) {
1111
; CHECK-NEXT: Cost Model: Found an estimated cost of 91 for instruction: %V32 = call i1 @llvm.vector.reduce.and.v32i1(<32 x i1> undef)
1212
; CHECK-NEXT: Cost Model: Found an estimated cost of 181 for instruction: %V64 = call i1 @llvm.vector.reduce.and.v64i1(<64 x i1> undef)
1313
; CHECK-NEXT: Cost Model: Found an estimated cost of 362 for instruction: %V128 = call i1 @llvm.vector.reduce.and.v128i1(<128 x i1> undef)
14+
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1i8 = call i8 @llvm.vector.reduce.and.v1i8(<1 x i8> undef)
15+
; CHECK-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V3i8 = call i8 @llvm.vector.reduce.and.v3i8(<3 x i8> undef)
16+
; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4i8 = call i8 @llvm.vector.reduce.and.v4i8(<4 x i8> undef)
17+
; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8i8 = call i8 @llvm.vector.reduce.and.v8i8(<8 x i8> undef)
18+
; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V16i8 = call i8 @llvm.vector.reduce.and.v16i8(<16 x i8> undef)
19+
; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V32i8 = call i8 @llvm.vector.reduce.and.v32i8(<32 x i8> undef)
20+
; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V64i8 = call i8 @llvm.vector.reduce.and.v64i8(<64 x i8> undef)
21+
; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4i16 = call i16 @llvm.vector.reduce.and.v4i16(<4 x i16> undef)
22+
; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8i16 = call i16 @llvm.vector.reduce.and.v8i16(<8 x i16> undef)
23+
; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16i16 = call i16 @llvm.vector.reduce.and.v16i16(<16 x i16> undef)
24+
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2i32 = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> undef)
25+
; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4i32 = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> undef)
26+
; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8i32 = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> undef)
27+
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2i64 = call i64 @llvm.vector.reduce.and.v2i64(<2 x i64> undef)
28+
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4i64 = call i64 @llvm.vector.reduce.and.v4i64(<4 x i64> undef)
1429
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
1530
;
31+
1632
%V1 = call i1 @llvm.vector.reduce.and.v1i1(<1 x i1> undef)
1733
%V2 = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> undef)
1834
%V4 = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> undef)
@@ -21,6 +37,22 @@ define i32 @reduce_i1(i32 %arg) {
2137
%V32 = call i1 @llvm.vector.reduce.and.v32i1(<32 x i1> undef)
2238
%V64 = call i1 @llvm.vector.reduce.and.v64i1(<64 x i1> undef)
2339
%V128 = call i1 @llvm.vector.reduce.and.v128i1(<128 x i1> undef)
40+
41+
%V1i8 = call i8 @llvm.vector.reduce.and.v1i8(<1 x i8> undef)
42+
%V3i8 = call i8 @llvm.vector.reduce.and.v3i8(<3 x i8> undef)
43+
%V4i8 = call i8 @llvm.vector.reduce.and.v4i8(<4 x i8> undef)
44+
%V8i8 = call i8 @llvm.vector.reduce.and.v8i8(<8 x i8> undef)
45+
%V16i8 = call i8 @llvm.vector.reduce.and.v16i8(<16 x i8> undef)
46+
%V32i8 = call i8 @llvm.vector.reduce.and.v32i8(<32 x i8> undef)
47+
%V64i8 = call i8 @llvm.vector.reduce.and.v64i8(<64 x i8> undef)
48+
%V4i16 = call i16 @llvm.vector.reduce.and.v4i16(<4 x i16> undef)
49+
%V8i16 = call i16 @llvm.vector.reduce.and.v8i16(<8 x i16> undef)
50+
%V16i16 = call i16 @llvm.vector.reduce.and.v16i16(<16 x i16> undef)
51+
%V2i32 = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> undef)
52+
%V4i32 = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> undef)
53+
%V8i32 = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> undef)
54+
%V2i64 = call i64 @llvm.vector.reduce.and.v2i64(<2 x i64> undef)
55+
%V4i64 = call i64 @llvm.vector.reduce.and.v4i64(<4 x i64> undef)
2456
ret i32 undef
2557
}
2658

@@ -32,3 +64,18 @@ declare i1 @llvm.vector.reduce.and.v16i1(<16 x i1>)
3264
declare i1 @llvm.vector.reduce.and.v32i1(<32 x i1>)
3365
declare i1 @llvm.vector.reduce.and.v64i1(<64 x i1>)
3466
declare i1 @llvm.vector.reduce.and.v128i1(<128 x i1>)
67+
declare i8 @llvm.vector.reduce.and.v1i8(<1 x i8>)
68+
declare i8 @llvm.vector.reduce.and.v3i8(<3 x i8>)
69+
declare i8 @llvm.vector.reduce.and.v4i8(<4 x i8>)
70+
declare i8 @llvm.vector.reduce.and.v8i8(<8 x i8>)
71+
declare i8 @llvm.vector.reduce.and.v16i8(<16 x i8>)
72+
declare i8 @llvm.vector.reduce.and.v32i8(<32 x i8>)
73+
declare i8 @llvm.vector.reduce.and.v64i8(<64 x i8>)
74+
declare i16 @llvm.vector.reduce.and.v4i16(<4 x i16>)
75+
declare i16 @llvm.vector.reduce.and.v8i16(<8 x i16>)
76+
declare i16 @llvm.vector.reduce.and.v16i16(<16 x i16>)
77+
declare i32 @llvm.vector.reduce.and.v2i32(<2 x i32>)
78+
declare i32 @llvm.vector.reduce.and.v4i32(<4 x i32>)
79+
declare i32 @llvm.vector.reduce.and.v8i32(<8 x i32>)
80+
declare i64 @llvm.vector.reduce.and.v2i64(<2 x i64>)
81+
declare i64 @llvm.vector.reduce.and.v4i64(<4 x i64>)

llvm/test/Analysis/CostModel/AArch64/reduce-or.ll

Lines changed: 47 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -11,8 +11,24 @@ define i32 @reduce_i1(i32 %arg) {
1111
; CHECK-NEXT: Cost Model: Found an estimated cost of 91 for instruction: %V32 = call i1 @llvm.vector.reduce.or.v32i1(<32 x i1> undef)
1212
; CHECK-NEXT: Cost Model: Found an estimated cost of 181 for instruction: %V64 = call i1 @llvm.vector.reduce.or.v64i1(<64 x i1> undef)
1313
; CHECK-NEXT: Cost Model: Found an estimated cost of 362 for instruction: %V128 = call i1 @llvm.vector.reduce.or.v128i1(<128 x i1> undef)
14+
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1i8 = call i8 @llvm.vector.reduce.or.v1i8(<1 x i8> undef)
15+
; CHECK-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V3i8 = call i8 @llvm.vector.reduce.or.v3i8(<3 x i8> undef)
16+
; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4i8 = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> undef)
17+
; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8i8 = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> undef)
18+
; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V16i8 = call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> undef)
19+
; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V32i8 = call i8 @llvm.vector.reduce.or.v32i8(<32 x i8> undef)
20+
; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V64i8 = call i8 @llvm.vector.reduce.or.v64i8(<64 x i8> undef)
21+
; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4i16 = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> undef)
22+
; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8i16 = call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> undef)
23+
; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16i16 = call i16 @llvm.vector.reduce.or.v16i16(<16 x i16> undef)
24+
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2i32 = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> undef)
25+
; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4i32 = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> undef)
26+
; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8i32 = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> undef)
27+
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2i64 = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> undef)
28+
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4i64 = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> undef)
1429
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
1530
;
31+
1632
%V1 = call i1 @llvm.vector.reduce.or.v1i1(<1 x i1> undef)
1733
%V2 = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> undef)
1834
%V4 = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> undef)
@@ -21,6 +37,22 @@ define i32 @reduce_i1(i32 %arg) {
2137
%V32 = call i1 @llvm.vector.reduce.or.v32i1(<32 x i1> undef)
2238
%V64 = call i1 @llvm.vector.reduce.or.v64i1(<64 x i1> undef)
2339
%V128 = call i1 @llvm.vector.reduce.or.v128i1(<128 x i1> undef)
40+
41+
%V1i8 = call i8 @llvm.vector.reduce.or.v1i8(<1 x i8> undef)
42+
%V3i8 = call i8 @llvm.vector.reduce.or.v3i8(<3 x i8> undef)
43+
%V4i8 = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> undef)
44+
%V8i8 = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> undef)
45+
%V16i8 = call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> undef)
46+
%V32i8 = call i8 @llvm.vector.reduce.or.v32i8(<32 x i8> undef)
47+
%V64i8 = call i8 @llvm.vector.reduce.or.v64i8(<64 x i8> undef)
48+
%V4i16 = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> undef)
49+
%V8i16 = call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> undef)
50+
%V16i16 = call i16 @llvm.vector.reduce.or.v16i16(<16 x i16> undef)
51+
%V2i32 = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> undef)
52+
%V4i32 = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> undef)
53+
%V8i32 = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> undef)
54+
%V2i64 = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> undef)
55+
%V4i64 = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> undef)
2456
ret i32 undef
2557
}
2658

@@ -32,3 +64,18 @@ declare i1 @llvm.vector.reduce.or.v16i1(<16 x i1>)
3264
declare i1 @llvm.vector.reduce.or.v32i1(<32 x i1>)
3365
declare i1 @llvm.vector.reduce.or.v64i1(<64 x i1>)
3466
declare i1 @llvm.vector.reduce.or.v128i1(<128 x i1>)
67+
declare i8 @llvm.vector.reduce.or.v1i8(<1 x i8>)
68+
declare i8 @llvm.vector.reduce.or.v3i8(<3 x i8>)
69+
declare i8 @llvm.vector.reduce.or.v4i8(<4 x i8>)
70+
declare i8 @llvm.vector.reduce.or.v8i8(<8 x i8>)
71+
declare i8 @llvm.vector.reduce.or.v16i8(<16 x i8>)
72+
declare i8 @llvm.vector.reduce.or.v32i8(<32 x i8>)
73+
declare i8 @llvm.vector.reduce.or.v64i8(<64 x i8>)
74+
declare i16 @llvm.vector.reduce.or.v4i16(<4 x i16>)
75+
declare i16 @llvm.vector.reduce.or.v8i16(<8 x i16>)
76+
declare i16 @llvm.vector.reduce.or.v16i16(<16 x i16>)
77+
declare i32 @llvm.vector.reduce.or.v2i32(<2 x i32>)
78+
declare i32 @llvm.vector.reduce.or.v4i32(<4 x i32>)
79+
declare i32 @llvm.vector.reduce.or.v8i32(<8 x i32>)
80+
declare i64 @llvm.vector.reduce.or.v2i64(<2 x i64>)
81+
declare i64 @llvm.vector.reduce.or.v4i64(<4 x i64>)
llvm/test/Analysis/CostModel/AArch64/reduce-xor.ll

Lines changed: 81 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,81 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
2+
; RUN: opt < %s -mtriple=aarch64-unknown-linux-gnu -cost-model -cost-kind=throughput -analyze | FileCheck %s
3+
4+
define i32 @reduce_i1(i32 %arg) {
5+
; CHECK-LABEL: 'reduce_i1'
6+
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.vector.reduce.xor.v1i1(<1 x i1> undef)
7+
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i1 @llvm.vector.reduce.xor.v2i1(<2 x i1> undef)
8+
; CHECK-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V4 = call i1 @llvm.vector.reduce.xor.v4i1(<4 x i1> undef)
9+
; CHECK-NEXT: Cost Model: Found an estimated cost of 129 for instruction: %V8 = call i1 @llvm.vector.reduce.xor.v8i1(<8 x i1> undef)
10+
; CHECK-NEXT: Cost Model: Found an estimated cost of 364 for instruction: %V16 = call i1 @llvm.vector.reduce.xor.v16i1(<16 x i1> undef)
11+
; CHECK-NEXT: Cost Model: Found an estimated cost of 455 for instruction: %V32 = call i1 @llvm.vector.reduce.xor.v32i1(<32 x i1> undef)
12+
; CHECK-NEXT: Cost Model: Found an estimated cost of 637 for instruction: %V64 = call i1 @llvm.vector.reduce.xor.v64i1(<64 x i1> undef)
13+
; CHECK-NEXT: Cost Model: Found an estimated cost of 1001 for instruction: %V128 = call i1 @llvm.vector.reduce.xor.v128i1(<128 x i1> undef)
14+
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1i8 = call i8 @llvm.vector.reduce.xor.v1i8(<1 x i8> undef)
15+
; CHECK-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V3i8 = call i8 @llvm.vector.reduce.xor.v3i8(<3 x i8> undef)
16+
; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4i8 = call i8 @llvm.vector.reduce.xor.v4i8(<4 x i8> undef)
17+
; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8i8 = call i8 @llvm.vector.reduce.xor.v8i8(<8 x i8> undef)
18+
; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V16i8 = call i8 @llvm.vector.reduce.xor.v16i8(<16 x i8> undef)
19+
; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V32i8 = call i8 @llvm.vector.reduce.xor.v32i8(<32 x i8> undef)
20+
; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V64i8 = call i8 @llvm.vector.reduce.xor.v64i8(<64 x i8> undef)
21+
; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4i16 = call i16 @llvm.vector.reduce.xor.v4i16(<4 x i16> undef)
22+
; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8i16 = call i16 @llvm.vector.reduce.xor.v8i16(<8 x i16> undef)
23+
; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16i16 = call i16 @llvm.vector.reduce.xor.v16i16(<16 x i16> undef)
24+
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2i32 = call i32 @llvm.vector.reduce.xor.v2i32(<2 x i32> undef)
25+
; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4i32 = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> undef)
26+
; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8i32 = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> undef)
27+
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2i64 = call i64 @llvm.vector.reduce.xor.v2i64(<2 x i64> undef)
28+
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4i64 = call i64 @llvm.vector.reduce.xor.v4i64(<4 x i64> undef)
29+
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
30+
;
31+
32+
%V1 = call i1 @llvm.vector.reduce.xor.v1i1(<1 x i1> undef)
33+
%V2 = call i1 @llvm.vector.reduce.xor.v2i1(<2 x i1> undef)
34+
%V4 = call i1 @llvm.vector.reduce.xor.v4i1(<4 x i1> undef)
35+
%V8 = call i1 @llvm.vector.reduce.xor.v8i1(<8 x i1> undef)
36+
%V16 = call i1 @llvm.vector.reduce.xor.v16i1(<16 x i1> undef)
37+
%V32 = call i1 @llvm.vector.reduce.xor.v32i1(<32 x i1> undef)
38+
%V64 = call i1 @llvm.vector.reduce.xor.v64i1(<64 x i1> undef)
39+
%V128 = call i1 @llvm.vector.reduce.xor.v128i1(<128 x i1> undef)
40+
41+
%V1i8 = call i8 @llvm.vector.reduce.xor.v1i8(<1 x i8> undef)
42+
%V3i8 = call i8 @llvm.vector.reduce.xor.v3i8(<3 x i8> undef)
43+
%V4i8 = call i8 @llvm.vector.reduce.xor.v4i8(<4 x i8> undef)
44+
%V8i8 = call i8 @llvm.vector.reduce.xor.v8i8(<8 x i8> undef)
45+
%V16i8 = call i8 @llvm.vector.reduce.xor.v16i8(<16 x i8> undef)
46+
%V32i8 = call i8 @llvm.vector.reduce.xor.v32i8(<32 x i8> undef)
47+
%V64i8 = call i8 @llvm.vector.reduce.xor.v64i8(<64 x i8> undef)
48+
%V4i16 = call i16 @llvm.vector.reduce.xor.v4i16(<4 x i16> undef)
49+
%V8i16 = call i16 @llvm.vector.reduce.xor.v8i16(<8 x i16> undef)
50+
%V16i16 = call i16 @llvm.vector.reduce.xor.v16i16(<16 x i16> undef)
51+
%V2i32 = call i32 @llvm.vector.reduce.xor.v2i32(<2 x i32> undef)
52+
%V4i32 = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> undef)
53+
%V8i32 = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> undef)
54+
%V2i64 = call i64 @llvm.vector.reduce.xor.v2i64(<2 x i64> undef)
55+
%V4i64 = call i64 @llvm.vector.reduce.xor.v4i64(<4 x i64> undef)
56+
ret i32 undef
57+
}
58+
59+
declare i1 @llvm.vector.reduce.xor.v1i1(<1 x i1>)
60+
declare i1 @llvm.vector.reduce.xor.v2i1(<2 x i1>)
61+
declare i1 @llvm.vector.reduce.xor.v4i1(<4 x i1>)
62+
declare i1 @llvm.vector.reduce.xor.v8i1(<8 x i1>)
63+
declare i1 @llvm.vector.reduce.xor.v16i1(<16 x i1>)
64+
declare i1 @llvm.vector.reduce.xor.v32i1(<32 x i1>)
65+
declare i1 @llvm.vector.reduce.xor.v64i1(<64 x i1>)
66+
declare i1 @llvm.vector.reduce.xor.v128i1(<128 x i1>)
67+
declare i8 @llvm.vector.reduce.xor.v1i8(<1 x i8>)
68+
declare i8 @llvm.vector.reduce.xor.v3i8(<3 x i8>)
69+
declare i8 @llvm.vector.reduce.xor.v4i8(<4 x i8>)
70+
declare i8 @llvm.vector.reduce.xor.v8i8(<8 x i8>)
71+
declare i8 @llvm.vector.reduce.xor.v16i8(<16 x i8>)
72+
declare i8 @llvm.vector.reduce.xor.v32i8(<32 x i8>)
73+
declare i8 @llvm.vector.reduce.xor.v64i8(<64 x i8>)
74+
declare i16 @llvm.vector.reduce.xor.v4i16(<4 x i16>)
75+
declare i16 @llvm.vector.reduce.xor.v8i16(<8 x i16>)
76+
declare i16 @llvm.vector.reduce.xor.v16i16(<16 x i16>)
77+
declare i32 @llvm.vector.reduce.xor.v2i32(<2 x i32>)
78+
declare i32 @llvm.vector.reduce.xor.v4i32(<4 x i32>)
79+
declare i32 @llvm.vector.reduce.xor.v8i32(<8 x i32>)
80+
declare i64 @llvm.vector.reduce.xor.v2i64(<2 x i64>)
81+
declare i64 @llvm.vector.reduce.xor.v4i64(<4 x i64>)

0 commit comments

Comments
 (0)