[DAG] replaceStoreOfInsertLoad - don't fold if the inserted element is implicitly truncated

RKSimon · RKSimon · commit ba818c4019c5 · 2023-08-21T11:22:07.000+01:00
D152276 wasn't handling the case where the inserted element is implicitly truncated into the vector, resulting in an i1 element (implicitly truncated from i8) overwriting 8 bits instead of 1 bit. This patch is intended to be merged into 17.x, so I've just disallowed any mismatch between the vector element type and the inserted element type. Technically we could be more elegant and permit truncated stores (as long as the store is still byte-sized), but the use cases for that are so limited that I'd prefer to play it safe for now. Candidate patch for llvm#64655 17.x merge. Differential Revision: https://reviews.llvm.org/D158366
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -20508,9 +20508,11 @@ SDValue DAGCombiner::replaceStoreOfInsertLoad(StoreSDNode *ST) {
   SDValue Elt = Value.getOperand(1);
   SDValue Idx = Value.getOperand(2);
 
-  // If the element isn't byte sized then we can't compute an offset
+  // If the element isn't byte sized or is implicitly truncated then we can't
+  // compute an offset.
   EVT EltVT = Elt.getValueType();
-  if (!EltVT.isByteSized())
+  if (!EltVT.isByteSized() ||
+      EltVT != Value.getOperand(0).getValueType().getVectorElementType())
     return SDValue();
 
   auto *Ld = dyn_cast<LoadSDNode>(Value.getOperand(0));
diff --git a/llvm/test/CodeGen/X86/pr64655.ll b/llvm/test/CodeGen/X86/pr64655.ll
@@ -41,7 +41,16 @@ define void @f(ptr %0) {
 ;
 ; AVX512-LABEL: f:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    movb $1, 1(%rdi)
+; AVX512-NEXT:    kmovb (%rdi), %k0
+; AVX512-NEXT:    movb $-3, %al
+; AVX512-NEXT:    kmovd %eax, %k1
+; AVX512-NEXT:    kandb %k1, %k0, %k0
+; AVX512-NEXT:    movb $1, %al
+; AVX512-NEXT:    kmovd %eax, %k1
+; AVX512-NEXT:    kshiftlb $7, %k1, %k1
+; AVX512-NEXT:    kshiftrb $6, %k1, %k1
+; AVX512-NEXT:    korb %k1, %k0, %k0
+; AVX512-NEXT:    kmovb %k0, (%rdi)
 ; AVX512-NEXT:    retq
   %2 = load <8 x i1>, ptr %0
   %3 = insertelement <8 x i1> %2, i1 true, i32 1