[msan] Add handlers for AVX masked load/store intrinsics #123857

Merged · 3 commits · Jan 26, 2025
154 changes: 153 additions & 1 deletion llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -3046,7 +3046,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
if (maybeHandleSimpleNomemIntrinsic(I))
return true;

// FIXME: detect and handle SSE maskstore/maskload
// FIXME: detect and handle SSE maskstore/maskload?
// Some cases are now handled in handleAVXMasked{Load,Store}.
return false;
}

@@ -3683,6 +3684,10 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
// TODO: Store origin.
}

// Intrinsic::masked_store
//
// Note: handleAVXMaskedStore handles AVX/AVX2 variants, though AVX512 masked
// stores are lowered to Intrinsic::masked_store.
void handleMaskedStore(IntrinsicInst &I) {
IRBuilder<> IRB(&I);
Value *V = I.getArgOperand(0);
@@ -3713,6 +3718,10 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
std::max(Alignment, kMinOriginAlignment));
}

// Intrinsic::masked_load
//
// Note: handleAVXMaskedLoad handles AVX/AVX2 variants, though AVX512 masked
// loads are lowered to Intrinsic::masked_load.
void handleMaskedLoad(IntrinsicInst &I) {
IRBuilder<> IRB(&I);
Value *Ptr = I.getArgOperand(0);
@@ -3754,6 +3763,125 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
setOrigin(&I, Origin);
}

// e.g., void @llvm.x86.avx.maskstore.ps.256(ptr, <8 x i32>, <8 x float>)
//                                           dst  mask       src
//
// AVX512 masked stores are lowered to Intrinsic::masked_store and are handled
// by handleMaskedStore.
//
// This function handles AVX and AVX2 masked stores; these use the MSBs of a
// vector of integers, unlike the LLVM masked intrinsics, which require a
// vector of booleans. X86InstCombineIntrinsic.cpp::simplifyX86MaskedLoad
// mentions that the x86 backend does not know how to efficiently convert
// from a vector of booleans back into the AVX mask format; therefore, they
// (and we) do not reduce AVX/AVX2 masked intrinsics into LLVM masked
// intrinsics.
void handleAVXMaskedStore(IntrinsicInst &I) {
IRBuilder<> IRB(&I);

Value *Dst = I.getArgOperand(0);
assert(Dst->getType()->isPointerTy() && "Destination is not a pointer!");

Value *Mask = I.getArgOperand(1);
assert(isa<VectorType>(Mask->getType()) && "Mask is not a vector!");

Value *Src = I.getArgOperand(2);
assert(isa<VectorType>(Src->getType()) && "Source is not a vector!");

const Align Alignment = Align(1);

Value *SrcShadow = getShadow(Src);

if (ClCheckAccessAddress) {
insertShadowCheck(Dst, &I);
insertShadowCheck(Mask, &I);
}

Value *DstShadowPtr;
Value *DstOriginPtr;
std::tie(DstShadowPtr, DstOriginPtr) = getShadowOriginPtr(
Dst, IRB, SrcShadow->getType(), Alignment, /*isStore*/ true);

SmallVector<Value *, 2> ShadowArgs;
ShadowArgs.append(1, DstShadowPtr);
ShadowArgs.append(1, Mask);
// The intrinsic may require floating-point but shadows can be arbitrary
// bit patterns, of which some would be interpreted as "invalid"
// floating-point values (NaN etc.); we assume the intrinsic will happily
// copy them.
ShadowArgs.append(1, IRB.CreateBitCast(SrcShadow, Src->getType()));

CallInst *CI =
IRB.CreateIntrinsic(IRB.getVoidTy(), I.getIntrinsicID(), ShadowArgs);
setShadow(&I, CI);

if (!MS.TrackOrigins)
return;

// Approximation only
auto &DL = F.getDataLayout();
paintOrigin(IRB, getOrigin(Src), DstOriginPtr,
DL.getTypeStoreSize(SrcShadow->getType()),
std::max(Alignment, kMinOriginAlignment));
}

// e.g., <8 x float> @llvm.x86.avx.maskload.ps.256(ptr, <8 x i32>)
//       return                                    src  mask
//
// Masked-off values are replaced with 0, which conveniently also represents
// initialized memory.
//
// AVX512 masked loads are lowered to Intrinsic::masked_load and are handled
// by handleMaskedLoad.
//
// We do not combine this with handleMaskedLoad; see comment in
// handleAVXMaskedStore for the rationale.
//
// This is subtly different from handleIntrinsicByApplyingToShadow(I, 1)
// because we need to apply getShadowOriginPtr, not getShadow, to the first
// parameter.
void handleAVXMaskedLoad(IntrinsicInst &I) {
IRBuilder<> IRB(&I);

Value *Src = I.getArgOperand(0);
assert(Src->getType()->isPointerTy() && "Source is not a pointer!");

Value *Mask = I.getArgOperand(1);
assert(isa<VectorType>(Mask->getType()) && "Mask is not a vector!");

const Align Alignment = Align(1);

if (ClCheckAccessAddress) {
insertShadowCheck(Mask, &I);
}

Type *SrcShadowTy = getShadowTy(Src);
Value *SrcShadowPtr, *SrcOriginPtr;
std::tie(SrcShadowPtr, SrcOriginPtr) =
getShadowOriginPtr(Src, IRB, SrcShadowTy, Alignment, /*isStore*/ false);

SmallVector<Value *, 2> ShadowArgs;
ShadowArgs.append(1, SrcShadowPtr);
ShadowArgs.append(1, Mask);

CallInst *CI =
IRB.CreateIntrinsic(I.getType(), I.getIntrinsicID(), ShadowArgs);
// The intrinsic may require floating-point but shadows can be arbitrary
// bit patterns, of which some would be interpreted as "invalid"
// floating-point values (NaN etc.); we assume the intrinsic will happily
// copy them.
setShadow(&I, IRB.CreateBitCast(CI, getShadowTy(&I)));

if (!MS.TrackOrigins)
return;

// The "pass-through" value is always zero (initialized). To the extent
// that that results in initialized aligned 4-byte chunks, the origin value
// is ignored. It is therefore correct to simply copy the origin from src.
Value *PtrSrcOrigin = IRB.CreateLoad(MS.OriginTy, SrcOriginPtr);
setOrigin(&I, PtrSrcOrigin);
}

// Instrument BMI / BMI2 intrinsics.
// All of these intrinsics are Z = I(X, Y)
// where the types of all operands and the result match, and are either i32 or
@@ -4466,6 +4594,30 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
break;
}

case Intrinsic::x86_avx_maskstore_ps:
case Intrinsic::x86_avx_maskstore_pd:
case Intrinsic::x86_avx_maskstore_ps_256:
case Intrinsic::x86_avx_maskstore_pd_256:
case Intrinsic::x86_avx2_maskstore_d:
case Intrinsic::x86_avx2_maskstore_q:
case Intrinsic::x86_avx2_maskstore_d_256:
case Intrinsic::x86_avx2_maskstore_q_256: {

Are there also AVX512 masked loads/stores that would benefit from this handling?

@thurstond (Contributor, Author) · Jan 22, 2025

That's an intriguing question, thanks!

LLVM automatically "upgrades" avx512.mask.load/store into the @llvm.masked.load/store intrinsic:

  } else if (Name.starts_with("avx512.mask.load")) {
    // "avx512.mask.loadu." or "avx512.mask.load."
    bool Aligned = Name[16] != 'u'; // "avx512.mask.loadu".
    Rep = upgradeMaskedLoad(Builder, CI->getArgOperand(0), CI->getArgOperand(1),
                            CI->getArgOperand(2), Aligned);

(https://github.com/llvm/llvm-project/blob/main/llvm/lib/IR/AutoUpgrade.cpp#L2801)

As a result, MemorySanitizer already handles avx512.mask.load/store via its existing, separate code path for instrumenting @llvm.masked.load/store (handleMaskedLoad/Store). For example, MemorySanitizer transforms:

define void @test_int_x86_avx512_mask_store_d_512(ptr %ptr1, ptr %ptr2, <16 x i32> %x1, i16 %x2) sanitize_memory {
  call void @llvm.x86.avx512.mask.store.d.512(ptr %ptr1, <16 x i32> %x1, i16 %x2)
  ret void
}

declare void @llvm.x86.avx512.mask.store.d.512(ptr, <16 x i32>, i16)

into:

; Function Attrs: sanitize_memory
define void @test_int_x86_avx512_mask_store_d_512(ptr %ptr1, ptr %ptr2, <16 x i32> %x1, i16 %x2) #0 {
  %1 = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
  %2 = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
  %3 = load i64, ptr @__msan_param_tls, align 8
  call void @llvm.donothing()
  %4 = bitcast i16 %1 to <16 x i1>
  %5 = bitcast i16 %x2 to <16 x i1>
  %6 = ptrtoint ptr %ptr1 to i64
  %7 = xor i64 %6, 87960930222080
  %8 = inttoptr i64 %7 to ptr
  call void @llvm.masked.store.v16i32.p0(<16 x i32> %2, ptr %8, i32 64, <16 x i1> %5)
  %_mscmp = icmp ne i64 %3, 0
  %9 = bitcast <16 x i1> %4 to i16
  %_mscmp1 = icmp ne i16 %9, 0
  %_msor = or i1 %_mscmp, %_mscmp1
  br i1 %_msor, label %10, label %11, !prof !1

10:                                               ; preds = %0
  call void @__msan_warning_noreturn() #4
  unreachable

11:                                               ; preds = %0
  call void @llvm.masked.store.v16i32.p0(<16 x i32> %x1, ptr %ptr1, i32 64, <16 x i1> %5)
  ret void
}

(Notice there are two @llvm.masked.store instructions: the first one is the shadow computation, the second one is the original computation.)

The "upgrade" process doesn't happen for AVX/AVX2, because the x86 backend doesn't know how to efficiently convert from the @llvm.masked.load/store format (vector of booleans) back into the AVX masked load/store format (vector of ints) (according to X86InstCombineIntrinsic.cpp::simplifyX86MaskedLoad); hence, MSan needs this patch to add special handling of AVX/AVX2 masked loads/stores.

I'll add a test case in another pull request to keep track of AVX512 :-)

@thurstond (Contributor, Author)

Test case: #123980

handleAVXMaskedStore(I);
break;
}

case Intrinsic::x86_avx_maskload_ps:
case Intrinsic::x86_avx_maskload_pd:
case Intrinsic::x86_avx_maskload_ps_256:
case Intrinsic::x86_avx_maskload_pd_256:
case Intrinsic::x86_avx2_maskload_d:
case Intrinsic::x86_avx2_maskload_q:
case Intrinsic::x86_avx2_maskload_d_256:
case Intrinsic::x86_avx2_maskload_q_256: {
handleAVXMaskedLoad(I);
break;
}

case Intrinsic::fshl:
case Intrinsic::fshr:
handleFunnelShift(I);