Skip to content

[AArch64] Allow SVE code generation for fixed-width vectors #67122

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Oct 23, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 1 addition & 3 deletions llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5619,9 +5619,7 @@ SDValue AArch64TargetLowering::LowerMLOAD(SDValue Op, SelectionDAG &DAG) const {
assert(LoadNode && "Expected custom lowering of a masked load node");
EVT VT = Op->getValueType(0);

if (useSVEForFixedLengthVectorVT(
VT,
/*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()))
if (useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
return LowerFixedLengthVectorMLoadToSVE(Op, DAG);

SDValue PassThru = LoadNode->getPassThru();
Expand Down
3 changes: 2 additions & 1 deletion llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -254,7 +254,8 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
return false;

// For fixed vectors, avoid scalarization if using SVE for them.
if (isa<FixedVectorType>(DataType) && !ST->useSVEForFixedLengthVectors())
if (isa<FixedVectorType>(DataType) && !ST->useSVEForFixedLengthVectors() &&
DataType->getPrimitiveSizeInBits() != 128)
return false; // Fall back to scalarization of masked operations.

return isElementTypeLegalForScalableVector(DataType->getScalarType());
Expand Down
90 changes: 90 additions & 0 deletions llvm/test/CodeGen/AArch64/sve-fixed-length-masked-128bit-loads.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mattr=+sve < %s | FileCheck %s


target triple = "aarch64-unknown-linux-gnu"

;
; Masked Load
;

; Masked load of a 128-bit (NEON-sized) <16 x i8> vector. With +sve this is
; expected to lower to an SVE ld1b: the <16 x i1> mask is turned into an SVE
; predicate by setting each lane's sign bit (shl #7 + cmlt) and comparing
; against zero under a vl16 ptrue.
define <16 x i8> @masked_load_v16i8(ptr %src, <16 x i1> %mask) {
; CHECK-LABEL: masked_load_v16i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b, vl16
; CHECK-NEXT: shl v0.16b, v0.16b, #7
; CHECK-NEXT: cmlt v0.16b, v0.16b, #0
; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, #0
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
  %load = call <16 x i8> @llvm.masked.load.v16i8(ptr %src, i32 8, <16 x i1> %mask, <16 x i8> zeroinitializer)
  ret <16 x i8> %load
}

; Masked load of <8 x half>. The i1 mask arrives packed in a 64-bit vector, so
; it is first widened (ushll) to h-sized lanes before being converted into an
; SVE predicate (shl #15 + cmlt + cmpne) for the ld1h.
define <8 x half> @masked_load_v8f16(ptr %src, <8 x i1> %mask) {
; CHECK-LABEL: masked_load_v8f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-NEXT: ptrue p0.h, vl8
; CHECK-NEXT: shl v0.8h, v0.8h, #15
; CHECK-NEXT: cmlt v0.8h, v0.8h, #0
; CHECK-NEXT: cmpne p0.h, p0/z, z0.h, #0
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
  %load = call <8 x half> @llvm.masked.load.v8f16(ptr %src, i32 8, <8 x i1> %mask, <8 x half> zeroinitializer)
  ret <8 x half> %load
}

; Masked load of <4 x float>: mask widened to s-sized lanes, converted to a
; vl4 predicate, then loaded with SVE ld1w.
define <4 x float> @masked_load_v4f32(ptr %src, <4 x i1> %mask) {
; CHECK-LABEL: masked_load_v4f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-NEXT: ptrue p0.s, vl4
; CHECK-NEXT: shl v0.4s, v0.4s, #31
; CHECK-NEXT: cmlt v0.4s, v0.4s, #0
; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, #0
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
  %load = call <4 x float> @llvm.masked.load.v4f32(ptr %src, i32 8, <4 x i1> %mask, <4 x float> zeroinitializer)
  ret <4 x float> %load
}

; Masked load of <2 x double>: mask widened to d-sized lanes, converted to a
; vl2 predicate, then loaded with SVE ld1d.
define <2 x double> @masked_load_v2f64(ptr %src, <2 x i1> %mask) {
; CHECK-LABEL: masked_load_v2f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-NEXT: ptrue p0.d, vl2
; CHECK-NEXT: shl v0.2d, v0.2d, #63
; CHECK-NEXT: cmlt v0.2d, v0.2d, #0
; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
  %load = call <2 x double> @llvm.masked.load.v2f64(ptr %src, i32 8, <2 x i1> %mask, <2 x double> zeroinitializer)
  ret <2 x double> %load
}

; As masked_load_v2f64 but with a non-zero passthru: inactive lanes are filled
; from %passthru via a predicated sel after the ld1d.
define <2 x double> @masked_load_passthru_v2f64(ptr %src, <2 x i1> %mask, <2 x double> %passthru) {
; CHECK-LABEL: masked_load_passthru_v2f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-NEXT: ptrue p0.d, vl2
; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
; CHECK-NEXT: shl v0.2d, v0.2d, #63
; CHECK-NEXT: cmlt v0.2d, v0.2d, #0
; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: sel z0.d, p0, z0.d, z1.d
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
  %load = call <2 x double> @llvm.masked.load.v2f64(ptr %src, i32 8, <2 x i1> %mask, <2 x double> %passthru)
  ret <2 x double> %load
}

declare <16 x i8> @llvm.masked.load.v16i8(ptr, i32, <16 x i1>, <16 x i8>)
declare <8 x half> @llvm.masked.load.v8f16(ptr, i32, <8 x i1>, <8 x half>)
declare <4 x float> @llvm.masked.load.v4f32(ptr, i32, <4 x i1>, <4 x float>)
declare <2 x double> @llvm.masked.load.v2f64(ptr, i32, <2 x i1>, <2 x double>)
73 changes: 73 additions & 0 deletions llvm/test/CodeGen/AArch64/sve-fixed-length-masked-128bit-stores.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mattr=+sve < %s | FileCheck %s


target triple = "aarch64-unknown-linux-gnu"

;
; Masked Store
;

; Masked store of a zero <16 x i8> vector: the mask becomes an SVE predicate
; (shl #7 + cmlt + cmpne under a vl16 ptrue) and the zero value (movi) is
; stored with SVE st1b.
define void @masked_store_v16i8(ptr %dst, <16 x i1> %mask) {
; CHECK-LABEL: masked_store_v16i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b, vl16
; CHECK-NEXT: shl v0.16b, v0.16b, #7
; CHECK-NEXT: movi v1.2d, #0000000000000000
; CHECK-NEXT: cmlt v0.16b, v0.16b, #0
; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, #0
; CHECK-NEXT: st1b { z1.b }, p0, [x0]
; CHECK-NEXT: ret
  call void @llvm.masked.store.v16i8(<16 x i8> zeroinitializer, ptr %dst, i32 8, <16 x i1> %mask)
  ret void
}

; Masked store of zero <8 x half>: mask widened to h lanes, converted to a
; vl8 predicate, value stored with SVE st1h.
define void @masked_store_v8f16(ptr %dst, <8 x i1> %mask) {
; CHECK-LABEL: masked_store_v8f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-NEXT: ptrue p0.h, vl8
; CHECK-NEXT: movi v1.2d, #0000000000000000
; CHECK-NEXT: shl v0.8h, v0.8h, #15
; CHECK-NEXT: cmlt v0.8h, v0.8h, #0
; CHECK-NEXT: cmpne p0.h, p0/z, z0.h, #0
; CHECK-NEXT: st1h { z1.h }, p0, [x0]
; CHECK-NEXT: ret
  call void @llvm.masked.store.v8f16(<8 x half> zeroinitializer, ptr %dst, i32 8, <8 x i1> %mask)
  ret void
}

; Masked store of zero <4 x float>: mask widened to s lanes, converted to a
; vl4 predicate, value stored with SVE st1w.
define void @masked_store_v4f32(ptr %dst, <4 x i1> %mask) {
; CHECK-LABEL: masked_store_v4f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-NEXT: ptrue p0.s, vl4
; CHECK-NEXT: movi v1.2d, #0000000000000000
; CHECK-NEXT: shl v0.4s, v0.4s, #31
; CHECK-NEXT: cmlt v0.4s, v0.4s, #0
; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, #0
; CHECK-NEXT: st1w { z1.s }, p0, [x0]
; CHECK-NEXT: ret
  call void @llvm.masked.store.v4f32(<4 x float> zeroinitializer, ptr %dst, i32 8, <4 x i1> %mask)
  ret void
}

; Masked store of zero <2 x double>: mask widened to d lanes, converted to a
; vl2 predicate, value stored with SVE st1d.
define void @masked_store_v2f64(ptr %dst, <2 x i1> %mask) {
; CHECK-LABEL: masked_store_v2f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-NEXT: ptrue p0.d, vl2
; CHECK-NEXT: movi v1.2d, #0000000000000000
; CHECK-NEXT: shl v0.2d, v0.2d, #63
; CHECK-NEXT: cmlt v0.2d, v0.2d, #0
; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0
; CHECK-NEXT: st1d { z1.d }, p0, [x0]
; CHECK-NEXT: ret
  call void @llvm.masked.store.v2f64(<2 x double> zeroinitializer, ptr %dst, i32 8, <2 x i1> %mask)
  ret void
}

declare void @llvm.masked.store.v16i8(<16 x i8>, ptr, i32, <16 x i1>)
declare void @llvm.masked.store.v8f16(<8 x half>, ptr, i32, <8 x i1>)
declare void @llvm.masked.store.v4f32(<4 x float>, ptr, i32, <4 x i1>)
declare void @llvm.masked.store.v2f64(<2 x double>, ptr, i32, <2 x i1>)
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ define <2 x float> @masked_load_v2f32(ptr %ap, ptr %bp) vscale_range(2,0) #0 {
ret <2 x float> %load
}

define <4 x float> @masked_load_v4f32(ptr %ap, ptr %bp) vscale_range(2,0) #0 {
define <4 x float> @masked_load_v4f32(ptr %ap, ptr %bp) vscale_range(1,0) #0 {
; CHECK-LABEL: masked_load_v4f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl4
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ define void @masked_store_v2f32(ptr %ap, ptr %bp) vscale_range(2,0) #0 {
ret void
}

define void @masked_store_v4f32(ptr %ap, ptr %bp) vscale_range(2,0) #0 {
define void @masked_store_v4f32(ptr %ap, ptr %bp) vscale_range(1,0) #0 {
; CHECK-LABEL: masked_store_v4f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl4
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -S %s -passes=scalarize-masked-mem-intrin -mtriple=aarch64-linux-gnu | FileCheck -check-prefixes=CHECK,CHECK-LE %s
; RUN: opt -S %s -passes=scalarize-masked-mem-intrin -mtriple=aarch64-linux-gnu -mattr=+sve | FileCheck -check-prefixes=CHECK,CHECK-LE %s
; RUN: opt -S %s -passes=scalarize-masked-mem-intrin -mtriple=aarch64_be-linux-gnu -data-layout="E-m:o-i64:64-i128:128-n32:64-S128" | FileCheck -check-prefixes=CHECK,CHECK-BE %s
; RUN: opt -S %s -passes=scalarize-masked-mem-intrin -mtriple=aarch64-linux-gnu | FileCheck -check-prefixes=CHECK-LE-COMMON,CHECK-LE %s
; RUN: opt -S %s -passes=scalarize-masked-mem-intrin -mtriple=aarch64-linux-gnu -mattr=+sve | FileCheck -check-prefixes=CHECK-LE-COMMON,CHECK-LE-SVE %s
; RUN: opt -S %s -passes=scalarize-masked-mem-intrin -mtriple=aarch64_be-linux-gnu -data-layout="E-m:o-i64:64-i128:128-n32:64-S128" | FileCheck -check-prefixes=CHECK-BE %s

define <2 x i64> @scalarize_v2i64(ptr %p, <2 x i1> %mask, <2 x i64> %passthru) {
; CHECK-LE-LABEL: @scalarize_v2i64(
Expand All @@ -28,6 +28,10 @@ define <2 x i64> @scalarize_v2i64(ptr %p, <2 x i1> %mask, <2 x i64> %passthru) {
; CHECK-LE-NEXT: [[RES_PHI_ELSE3:%.*]] = phi <2 x i64> [ [[TMP10]], [[COND_LOAD1]] ], [ [[RES_PHI_ELSE]], [[ELSE]] ]
; CHECK-LE-NEXT: ret <2 x i64> [[RES_PHI_ELSE3]]
;
; CHECK-LE-SVE-LABEL: @scalarize_v2i64(
; CHECK-LE-SVE-NEXT: [[RET:%.*]] = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr [[P:%.*]], i32 128, <2 x i1> [[MASK:%.*]], <2 x i64> [[PASSTHRU:%.*]])
; CHECK-LE-SVE-NEXT: ret <2 x i64> [[RET]]
;
; CHECK-BE-LABEL: @scalarize_v2i64(
; CHECK-BE-NEXT: [[SCALAR_MASK:%.*]] = bitcast <2 x i1> [[MASK:%.*]] to i2
; CHECK-BE-NEXT: [[TMP1:%.*]] = and i2 [[SCALAR_MASK]], -2
Expand Down Expand Up @@ -57,58 +61,83 @@ define <2 x i64> @scalarize_v2i64(ptr %p, <2 x i1> %mask, <2 x i64> %passthru) {
}

define <2 x i64> @scalarize_v2i64_ones_mask(ptr %p, <2 x i64> %passthru) {
; CHECK-LABEL: @scalarize_v2i64_ones_mask(
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[P:%.*]], align 8
; CHECK-NEXT: ret <2 x i64> [[TMP1]]
; CHECK-LE-LABEL: @scalarize_v2i64_ones_mask(
; CHECK-LE-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[P:%.*]], align 8
; CHECK-LE-NEXT: ret <2 x i64> [[TMP1]]
;
; CHECK-LE-SVE-LABEL: @scalarize_v2i64_ones_mask(
; CHECK-LE-SVE-NEXT: [[RET:%.*]] = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr [[P:%.*]], i32 8, <2 x i1> <i1 true, i1 true>, <2 x i64> [[PASSTHRU:%.*]])
; CHECK-LE-SVE-NEXT: ret <2 x i64> [[RET]]
;
; CHECK-BE-LABEL: @scalarize_v2i64_ones_mask(
; CHECK-BE-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[P:%.*]], align 8
; CHECK-BE-NEXT: ret <2 x i64> [[TMP1]]
;
%ret = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr %p, i32 8, <2 x i1> <i1 true, i1 true>, <2 x i64> %passthru)
ret <2 x i64> %ret
}

define <2 x i64> @scalarize_v2i64_zero_mask(ptr %p, <2 x i64> %passthru) {
; CHECK-LABEL: @scalarize_v2i64_zero_mask(
; CHECK-NEXT: ret <2 x i64> [[PASSTHRU:%.*]]
; CHECK-LE-LABEL: @scalarize_v2i64_zero_mask(
; CHECK-LE-NEXT: ret <2 x i64> [[PASSTHRU:%.*]]
;
; CHECK-LE-SVE-LABEL: @scalarize_v2i64_zero_mask(
; CHECK-LE-SVE-NEXT: [[RET:%.*]] = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr [[P:%.*]], i32 8, <2 x i1> zeroinitializer, <2 x i64> [[PASSTHRU:%.*]])
; CHECK-LE-SVE-NEXT: ret <2 x i64> [[RET]]
;
; CHECK-BE-LABEL: @scalarize_v2i64_zero_mask(
; CHECK-BE-NEXT: ret <2 x i64> [[PASSTHRU:%.*]]
;
%ret = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr %p, i32 8, <2 x i1> <i1 false, i1 false>, <2 x i64> %passthru)
ret <2 x i64> %ret
}

define <2 x i64> @scalarize_v2i64_const_mask(ptr %p, <2 x i64> %passthru) {
; CHECK-LABEL: @scalarize_v2i64_const_mask(
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i32 1
; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr [[TMP1]], align 8
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> [[PASSTHRU:%.*]], i64 [[TMP2]], i64 1
; CHECK-NEXT: ret <2 x i64> [[TMP3]]
; CHECK-LE-LABEL: @scalarize_v2i64_const_mask(
; CHECK-LE-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i32 1
; CHECK-LE-NEXT: [[TMP2:%.*]] = load i64, ptr [[TMP1]], align 8
; CHECK-LE-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> [[PASSTHRU:%.*]], i64 [[TMP2]], i64 1
; CHECK-LE-NEXT: ret <2 x i64> [[TMP3]]
;
; CHECK-LE-SVE-LABEL: @scalarize_v2i64_const_mask(
; CHECK-LE-SVE-NEXT: [[RET:%.*]] = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr [[P:%.*]], i32 8, <2 x i1> <i1 false, i1 true>, <2 x i64> [[PASSTHRU:%.*]])
; CHECK-LE-SVE-NEXT: ret <2 x i64> [[RET]]
;
; CHECK-BE-LABEL: @scalarize_v2i64_const_mask(
; CHECK-BE-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i32 1
; CHECK-BE-NEXT: [[TMP2:%.*]] = load i64, ptr [[TMP1]], align 8
; CHECK-BE-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> [[PASSTHRU:%.*]], i64 [[TMP2]], i64 1
; CHECK-BE-NEXT: ret <2 x i64> [[TMP3]]
;
%ret = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr %p, i32 8, <2 x i1> <i1 false, i1 true>, <2 x i64> %passthru)
ret <2 x i64> %ret
}

; This use a byte sized but non power of 2 element size. This used to crash due to bad alignment calculation.
define <2 x i24> @scalarize_v2i24(ptr %p, <2 x i1> %mask, <2 x i24> %passthru) {
; CHECK-LE-LABEL: @scalarize_v2i24(
; CHECK-LE-NEXT: [[SCALAR_MASK:%.*]] = bitcast <2 x i1> [[MASK:%.*]] to i2
; CHECK-LE-NEXT: [[TMP1:%.*]] = and i2 [[SCALAR_MASK]], 1
; CHECK-LE-NEXT: [[TMP2:%.*]] = icmp ne i2 [[TMP1]], 0
; CHECK-LE-NEXT: br i1 [[TMP2]], label [[COND_LOAD:%.*]], label [[ELSE:%.*]]
; CHECK-LE: cond.load:
; CHECK-LE-NEXT: [[TMP3:%.*]] = getelementptr inbounds i24, ptr [[P:%.*]], i32 0
; CHECK-LE-NEXT: [[TMP4:%.*]] = load i24, ptr [[TMP3]], align 1
; CHECK-LE-NEXT: [[TMP5:%.*]] = insertelement <2 x i24> [[PASSTHRU:%.*]], i24 [[TMP4]], i64 0
; CHECK-LE-NEXT: br label [[ELSE]]
; CHECK-LE: else:
; CHECK-LE-NEXT: [[RES_PHI_ELSE:%.*]] = phi <2 x i24> [ [[TMP5]], [[COND_LOAD]] ], [ [[PASSTHRU]], [[TMP0:%.*]] ]
; CHECK-LE-NEXT: [[TMP6:%.*]] = and i2 [[SCALAR_MASK]], -2
; CHECK-LE-NEXT: [[TMP7:%.*]] = icmp ne i2 [[TMP6]], 0
; CHECK-LE-NEXT: br i1 [[TMP7]], label [[COND_LOAD1:%.*]], label [[ELSE2:%.*]]
; CHECK-LE: cond.load1:
; CHECK-LE-NEXT: [[TMP8:%.*]] = getelementptr inbounds i24, ptr [[P]], i32 1
; CHECK-LE-NEXT: [[TMP9:%.*]] = load i24, ptr [[TMP8]], align 1
; CHECK-LE-NEXT: [[TMP10:%.*]] = insertelement <2 x i24> [[RES_PHI_ELSE]], i24 [[TMP9]], i64 1
; CHECK-LE-NEXT: br label [[ELSE2]]
; CHECK-LE: else2:
; CHECK-LE-NEXT: [[RES_PHI_ELSE3:%.*]] = phi <2 x i24> [ [[TMP10]], [[COND_LOAD1]] ], [ [[RES_PHI_ELSE]], [[ELSE]] ]
; CHECK-LE-NEXT: ret <2 x i24> [[RES_PHI_ELSE3]]
; CHECK-LE-COMMON-LABEL: @scalarize_v2i24(
; CHECK-LE-COMMON-NEXT: [[SCALAR_MASK:%.*]] = bitcast <2 x i1> [[MASK:%.*]] to i2
; CHECK-LE-COMMON-NEXT: [[TMP1:%.*]] = and i2 [[SCALAR_MASK]], 1
; CHECK-LE-COMMON-NEXT: [[TMP2:%.*]] = icmp ne i2 [[TMP1]], 0
; CHECK-LE-COMMON-NEXT: br i1 [[TMP2]], label [[COND_LOAD:%.*]], label [[ELSE:%.*]]
; CHECK-LE-COMMON: cond.load:
; CHECK-LE-COMMON-NEXT: [[TMP3:%.*]] = getelementptr inbounds i24, ptr [[P:%.*]], i32 0
; CHECK-LE-COMMON-NEXT: [[TMP4:%.*]] = load i24, ptr [[TMP3]], align 1
; CHECK-LE-COMMON-NEXT: [[TMP5:%.*]] = insertelement <2 x i24> [[PASSTHRU:%.*]], i24 [[TMP4]], i64 0
; CHECK-LE-COMMON-NEXT: br label [[ELSE]]
; CHECK-LE-COMMON: else:
; CHECK-LE-COMMON-NEXT: [[RES_PHI_ELSE:%.*]] = phi <2 x i24> [ [[TMP5]], [[COND_LOAD]] ], [ [[PASSTHRU]], [[TMP0:%.*]] ]
; CHECK-LE-COMMON-NEXT: [[TMP6:%.*]] = and i2 [[SCALAR_MASK]], -2
; CHECK-LE-COMMON-NEXT: [[TMP7:%.*]] = icmp ne i2 [[TMP6]], 0
; CHECK-LE-COMMON-NEXT: br i1 [[TMP7]], label [[COND_LOAD1:%.*]], label [[ELSE2:%.*]]
; CHECK-LE-COMMON: cond.load1:
; CHECK-LE-COMMON-NEXT: [[TMP8:%.*]] = getelementptr inbounds i24, ptr [[P]], i32 1
; CHECK-LE-COMMON-NEXT: [[TMP9:%.*]] = load i24, ptr [[TMP8]], align 1
; CHECK-LE-COMMON-NEXT: [[TMP10:%.*]] = insertelement <2 x i24> [[RES_PHI_ELSE]], i24 [[TMP9]], i64 1
; CHECK-LE-COMMON-NEXT: br label [[ELSE2]]
; CHECK-LE-COMMON: else2:
; CHECK-LE-COMMON-NEXT: [[RES_PHI_ELSE3:%.*]] = phi <2 x i24> [ [[TMP10]], [[COND_LOAD1]] ], [ [[RES_PHI_ELSE]], [[ELSE]] ]
; CHECK-LE-COMMON-NEXT: ret <2 x i24> [[RES_PHI_ELSE3]]
;
; CHECK-BE-LABEL: @scalarize_v2i24(
; CHECK-BE-NEXT: [[SCALAR_MASK:%.*]] = bitcast <2 x i1> [[MASK:%.*]] to i2
Expand Down Expand Up @@ -140,29 +169,29 @@ define <2 x i24> @scalarize_v2i24(ptr %p, <2 x i1> %mask, <2 x i24> %passthru) {

; This use a byte sized but non power of 2 element size. This used to crash due to bad alignment calculation.
define <2 x i48> @scalarize_v2i48(ptr %p, <2 x i1> %mask, <2 x i48> %passthru) {
; CHECK-LE-LABEL: @scalarize_v2i48(
; CHECK-LE-NEXT: [[SCALAR_MASK:%.*]] = bitcast <2 x i1> [[MASK:%.*]] to i2
; CHECK-LE-NEXT: [[TMP1:%.*]] = and i2 [[SCALAR_MASK]], 1
; CHECK-LE-NEXT: [[TMP2:%.*]] = icmp ne i2 [[TMP1]], 0
; CHECK-LE-NEXT: br i1 [[TMP2]], label [[COND_LOAD:%.*]], label [[ELSE:%.*]]
; CHECK-LE: cond.load:
; CHECK-LE-NEXT: [[TMP3:%.*]] = getelementptr inbounds i48, ptr [[P:%.*]], i32 0
; CHECK-LE-NEXT: [[TMP4:%.*]] = load i48, ptr [[TMP3]], align 2
; CHECK-LE-NEXT: [[TMP5:%.*]] = insertelement <2 x i48> [[PASSTHRU:%.*]], i48 [[TMP4]], i64 0
; CHECK-LE-NEXT: br label [[ELSE]]
; CHECK-LE: else:
; CHECK-LE-NEXT: [[RES_PHI_ELSE:%.*]] = phi <2 x i48> [ [[TMP5]], [[COND_LOAD]] ], [ [[PASSTHRU]], [[TMP0:%.*]] ]
; CHECK-LE-NEXT: [[TMP6:%.*]] = and i2 [[SCALAR_MASK]], -2
; CHECK-LE-NEXT: [[TMP7:%.*]] = icmp ne i2 [[TMP6]], 0
; CHECK-LE-NEXT: br i1 [[TMP7]], label [[COND_LOAD1:%.*]], label [[ELSE2:%.*]]
; CHECK-LE: cond.load1:
; CHECK-LE-NEXT: [[TMP8:%.*]] = getelementptr inbounds i48, ptr [[P]], i32 1
; CHECK-LE-NEXT: [[TMP9:%.*]] = load i48, ptr [[TMP8]], align 2
; CHECK-LE-NEXT: [[TMP10:%.*]] = insertelement <2 x i48> [[RES_PHI_ELSE]], i48 [[TMP9]], i64 1
; CHECK-LE-NEXT: br label [[ELSE2]]
; CHECK-LE: else2:
; CHECK-LE-NEXT: [[RES_PHI_ELSE3:%.*]] = phi <2 x i48> [ [[TMP10]], [[COND_LOAD1]] ], [ [[RES_PHI_ELSE]], [[ELSE]] ]
; CHECK-LE-NEXT: ret <2 x i48> [[RES_PHI_ELSE3]]
; CHECK-LE-COMMON-LABEL: @scalarize_v2i48(
; CHECK-LE-COMMON-NEXT: [[SCALAR_MASK:%.*]] = bitcast <2 x i1> [[MASK:%.*]] to i2
; CHECK-LE-COMMON-NEXT: [[TMP1:%.*]] = and i2 [[SCALAR_MASK]], 1
; CHECK-LE-COMMON-NEXT: [[TMP2:%.*]] = icmp ne i2 [[TMP1]], 0
; CHECK-LE-COMMON-NEXT: br i1 [[TMP2]], label [[COND_LOAD:%.*]], label [[ELSE:%.*]]
; CHECK-LE-COMMON: cond.load:
; CHECK-LE-COMMON-NEXT: [[TMP3:%.*]] = getelementptr inbounds i48, ptr [[P:%.*]], i32 0
; CHECK-LE-COMMON-NEXT: [[TMP4:%.*]] = load i48, ptr [[TMP3]], align 2
; CHECK-LE-COMMON-NEXT: [[TMP5:%.*]] = insertelement <2 x i48> [[PASSTHRU:%.*]], i48 [[TMP4]], i64 0
; CHECK-LE-COMMON-NEXT: br label [[ELSE]]
; CHECK-LE-COMMON: else:
; CHECK-LE-COMMON-NEXT: [[RES_PHI_ELSE:%.*]] = phi <2 x i48> [ [[TMP5]], [[COND_LOAD]] ], [ [[PASSTHRU]], [[TMP0:%.*]] ]
; CHECK-LE-COMMON-NEXT: [[TMP6:%.*]] = and i2 [[SCALAR_MASK]], -2
; CHECK-LE-COMMON-NEXT: [[TMP7:%.*]] = icmp ne i2 [[TMP6]], 0
; CHECK-LE-COMMON-NEXT: br i1 [[TMP7]], label [[COND_LOAD1:%.*]], label [[ELSE2:%.*]]
; CHECK-LE-COMMON: cond.load1:
; CHECK-LE-COMMON-NEXT: [[TMP8:%.*]] = getelementptr inbounds i48, ptr [[P]], i32 1
; CHECK-LE-COMMON-NEXT: [[TMP9:%.*]] = load i48, ptr [[TMP8]], align 2
; CHECK-LE-COMMON-NEXT: [[TMP10:%.*]] = insertelement <2 x i48> [[RES_PHI_ELSE]], i48 [[TMP9]], i64 1
; CHECK-LE-COMMON-NEXT: br label [[ELSE2]]
; CHECK-LE-COMMON: else2:
; CHECK-LE-COMMON-NEXT: [[RES_PHI_ELSE3:%.*]] = phi <2 x i48> [ [[TMP10]], [[COND_LOAD1]] ], [ [[RES_PHI_ELSE]], [[ELSE]] ]
; CHECK-LE-COMMON-NEXT: ret <2 x i48> [[RES_PHI_ELSE3]]
;
; CHECK-BE-LABEL: @scalarize_v2i48(
; CHECK-BE-NEXT: [[SCALAR_MASK:%.*]] = bitcast <2 x i1> [[MASK:%.*]] to i2
Expand Down
Loading