Commit d636bcb
[RISCV] Introduce unaligned-vector-mem feature
This allows us to model, and thus test, transforms which are legal only when vector loads with less than element alignment are supported. This was originally part of D126085, but was split out because we didn't have a good example of such a transform. As can be seen in the test diffs, the recently added concat_vector(loads) -> strided_load transform (from D147713) now benefits from the unaligned support.

While making this change, I realized that we actually *do* support unaligned vector loads and stores of all types via conversion to an i8 element type. For contiguous loads and stores without masking, we already implement this in the backend - we just don't tell the optimizer about it. For indexed accesses, lowering to i8 requires complicated addressing. For indexed and segmented accesses, we'd have to use indexed addressing. All around, this doesn't seem worth pursuing, but it makes for an interesting observation.

Differential Revision: https://reviews.llvm.org/D149375
1 parent e161fcd commit d636bcb
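
As an aside to the i8-conversion observation in the message above, here is a minimal IR sketch of the idea. It is my own illustration, not part of this patch, and the function name is made up: an unmasked, under-aligned vector load covers the same bytes as a byte-element load of the same width, so it can be expressed as an e8 access plus a bitcast.

define <4 x i16> @unaligned_load_via_i8(ptr %p) {
  ; An align-1 <4 x i16> load reads the same 8 bytes as an <8 x i8> load,
  ; and e8 accesses never have an alignment problem.
  %bytes = load <8 x i8>, ptr %p, align 1
  %v = bitcast <8 x i8> %bytes to <4 x i16>
  ret <4 x i16> %v
}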

File tree

5 files changed: +194, -21 lines changed


llvm/lib/Target/RISCV/RISCVFeatures.td

Lines changed: 5 additions & 0 deletions
@@ -717,6 +717,11 @@ def FeatureUnalignedScalarMem
                        "true", "Has reasonably performant unaligned scalar "
                        "loads and stores">;
 
+def FeatureUnalignedVectorMem
+   : SubtargetFeature<"unaligned-vector-mem", "EnableUnalignedVectorMem",
+                      "true", "Has reasonably performant unaligned vector "
+                      "loads and stores">;
+
 def TuneNoOptimizedZeroStrideLoad
     : SubtargetFeature<"no-optimized-zero-stride-load", "HasOptimizedZeroStrideLoad",
                        "false", "Hasn't optimized (perform fewer memory operations)"

llvm/lib/Target/RISCV/RISCVISelLowering.cpp

Lines changed: 9 additions & 2 deletions
@@ -15636,7 +15636,13 @@ bool RISCVTargetLowering::allowsMisalignedMemoryAccesses(
     return true;
   }
 
-  return false;
+  // Note: We lower an unmasked unaligned vector access to an equally sized
+  // e8 element type access. Given this, we effectively support all unmasked
+  // misaligned accesses. TODO: Work through the codegen implications of
+  // allowing such accesses to be formed, and considered fast.
+  if (Fast)
+    *Fast = 0;
+  return Subtarget.enableUnalignedVectorMem();
 }
 
 bool RISCVTargetLowering::splitValueIntoRegisterParts(
@@ -15811,7 +15817,8 @@ bool RISCVTargetLowering::isLegalStridedLoadStore(const DataLayout &DL,
   if (!isLegalElementTypeForRVV(ScalarType))
     return false;
 
-  if (Alignment < DL.getTypeStoreSize(ScalarType).getFixedValue())
+  if (!Subtarget.enableUnalignedVectorMem() &&
+      Alignment < DL.getTypeStoreSize(ScalarType).getFixedValue())
     return false;
 
   return true;
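
For context, the relaxed checks in this file are what let the concat_vector(loads) -> strided_load transform mentioned in the commit message fire on align-1 inputs (see the strided_unaligned test below). That combine runs on the SelectionDAG, but a rough IR-level analogue of its result, written here purely as an illustration (the function name is made up, and the mapping to the DAG combine is my assumption), is a single element-coerced strided load:

declare <2 x i64> @llvm.experimental.vp.strided.load.v2i64.p0.i64(ptr, i64, <2 x i1>, i32)

define <2 x i64> @strided_pair_sketch(ptr %x, i64 %s) {
  ; Two under-aligned 8-byte chunks at %x and %x + %s, fetched as one
  ; i64-coerced strided load; without +unaligned-vector-mem the align 1
  ; would fail the element-alignment check in isLegalStridedLoadStore.
  %v = call <2 x i64> @llvm.experimental.vp.strided.load.v2i64.p0.i64(ptr align 1 %x, i64 %s, <2 x i1> <i1 true, i1 true>, i32 2)
  ret <2 x i64> %v
}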

llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h

Lines changed: 8 additions & 6 deletions
@@ -190,11 +190,12 @@ class RISCVTTIImpl : public BasicTTIImplBase<RISCVTTIImpl> {
     if (isa<FixedVectorType>(DataType) && !ST->useRVVForFixedLengthVectors())
       return false;
 
-    if (Alignment <
-        DL.getTypeStoreSize(DataType->getScalarType()).getFixedValue())
+    auto *ElemType = DataType->getScalarType();
+    if (!ST->enableUnalignedVectorMem() &&
+        Alignment < DL.getTypeStoreSize(ElemType).getFixedValue())
       return false;
 
-    return TLI->isLegalElementTypeForRVV(DataType->getScalarType());
+    return TLI->isLegalElementTypeForRVV(ElemType);
   }
 
   bool isLegalMaskedLoad(Type *DataType, Align Alignment) {
@@ -212,11 +213,12 @@ class RISCVTTIImpl : public BasicTTIImplBase<RISCVTTIImpl> {
     if (isa<FixedVectorType>(DataType) && !ST->useRVVForFixedLengthVectors())
       return false;
 
-    if (Alignment <
-        DL.getTypeStoreSize(DataType->getScalarType()).getFixedValue())
+    auto *ElemType = DataType->getScalarType();
+    if (!ST->enableUnalignedVectorMem() &&
+        Alignment < DL.getTypeStoreSize(ElemType).getFixedValue())
       return false;
 
-    return TLI->isLegalElementTypeForRVV(DataType->getScalarType());
+    return TLI->isLegalElementTypeForRVV(ElemType);
   }
 
   bool isLegalMaskedGather(Type *DataType, Align Alignment) {
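
The two functions touched here appear to be the shared helpers behind the isLegalMaskedLoad/Store and isLegalMaskedGather/Scatter queries consulted by the middle end. As a hedged illustration (not from this patch; the function name is made up), an under-aligned masked load like the one below could now be reported as legal when +unaligned-vector-mem is set, instead of being rejected for failing the element-alignment check:

declare <4 x i16> @llvm.masked.load.v4i16.p0(ptr, i32, <4 x i1>, <4 x i16>)

define <4 x i16> @masked_load_align1_sketch(ptr %p, <4 x i1> %m) {
  ; The i32 operand is the alignment (1 byte here), which is below the
  ; 2-byte element alignment the old check required for e16 data.
  %v = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr %p, i32 1, <4 x i1> %m, <4 x i16> zeroinitializer)
  ret <4 x i16> %v
}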

llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-combine.ll

Lines changed: 61 additions & 13 deletions
@@ -1,7 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=riscv32 -mattr=+v,+zfh,+experimental-zvfh -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK,RV32
-; RUN: llc -mtriple=riscv64 -mattr=+v,+zfh,+experimental-zvfh -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK,RV64
-; RUN: llc -mtriple=riscv64 -mattr=+f,+zfh,+zve64f,+zvl128b,+experimental-zvfh -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK,ZVE64F
+; RUN: llc -mtriple=riscv32 -mattr=+v,+zfh,+experimental-zvfh -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK,CHECK-NO-MISALIGN,RV32
+; RUN: llc -mtriple=riscv64 -mattr=+v,+zfh,+experimental-zvfh -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK,CHECK-NO-MISALIGN,RV64
+; RUN: llc -mtriple=riscv64 -mattr=+v,+zfh,+experimental-zvfh,+unaligned-vector-mem -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK,RV64,RV64-MISALIGN
+
+; RUN: llc -mtriple=riscv64 -mattr=+f,+zfh,+zve64f,+zvl128b,+experimental-zvfh -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK,CHECK-NO-MISALIGN,ZVE64F
 
 ; The two loads are contigous and should be folded into one
 define void @widen_2xv4i16(ptr %x, ptr %z) {
@@ -109,6 +111,46 @@ define void @widen_4xv4i16(ptr %x, ptr %z) {
   ret void
 }
 
+define void @widen_4xv4i16_unaligned(ptr %x, ptr %z) {
+; CHECK-NO-MISALIGN-LABEL: widen_4xv4i16_unaligned:
+; CHECK-NO-MISALIGN:       # %bb.0:
+; CHECK-NO-MISALIGN-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NO-MISALIGN-NEXT:    vle8.v v8, (a0)
+; CHECK-NO-MISALIGN-NEXT:    addi a2, a0, 8
+; CHECK-NO-MISALIGN-NEXT:    vle8.v v10, (a2)
+; CHECK-NO-MISALIGN-NEXT:    addi a2, a0, 16
+; CHECK-NO-MISALIGN-NEXT:    vle8.v v12, (a2)
+; CHECK-NO-MISALIGN-NEXT:    addi a0, a0, 24
+; CHECK-NO-MISALIGN-NEXT:    vle8.v v14, (a0)
+; CHECK-NO-MISALIGN-NEXT:    vsetivli zero, 8, e16, m2, tu, ma
+; CHECK-NO-MISALIGN-NEXT:    vslideup.vi v8, v10, 4
+; CHECK-NO-MISALIGN-NEXT:    vsetivli zero, 12, e16, m2, tu, ma
+; CHECK-NO-MISALIGN-NEXT:    vslideup.vi v8, v12, 8
+; CHECK-NO-MISALIGN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
+; CHECK-NO-MISALIGN-NEXT:    vslideup.vi v8, v14, 12
+; CHECK-NO-MISALIGN-NEXT:    vse16.v v8, (a1)
+; CHECK-NO-MISALIGN-NEXT:    ret
+;
+; RV64-MISALIGN-LABEL: widen_4xv4i16_unaligned:
+; RV64-MISALIGN:       # %bb.0:
+; RV64-MISALIGN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
+; RV64-MISALIGN-NEXT:    vle16.v v8, (a0)
+; RV64-MISALIGN-NEXT:    vse16.v v8, (a1)
+; RV64-MISALIGN-NEXT:    ret
+  %a = load <4 x i16>, ptr %x, align 1
+  %b.gep = getelementptr i8, ptr %x, i64 8
+  %b = load <4 x i16>, ptr %b.gep, align 1
+  %c.gep = getelementptr i8, ptr %b.gep, i64 8
+  %c = load <4 x i16>, ptr %c.gep, align 1
+  %d.gep = getelementptr i8, ptr %c.gep, i64 8
+  %d = load <4 x i16>, ptr %d.gep, align 1
+  %e.0 = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %e.1 = shufflevector <4 x i16> %c, <4 x i16> %d, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %e.2 = shufflevector <8 x i16> %e.0, <8 x i16> %e.1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  store <16 x i16> %e.2, ptr %z
+  ret void
+}
+
 ; Should be a strided load - with type coercion to i64
 define void @strided_constant(ptr %x, ptr %z) {
 ; CHECK-LABEL: strided_constant:
@@ -365,17 +407,23 @@ define void @strided_runtime_4xv2f32(ptr %x, ptr %z, i64 %s) {
   ret void
 }
 
-; Shouldn't be combined because the resulting load would not be aligned
 define void @strided_unaligned(ptr %x, ptr %z, i64 %s) {
-; CHECK-LABEL: strided_unaligned:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT:    vle8.v v8, (a0)
-; CHECK-NEXT:    add a0, a0, a2
-; CHECK-NEXT:    vle8.v v9, (a0)
-; CHECK-NEXT:    vslideup.vi v8, v9, 4
-; CHECK-NEXT:    vse16.v v8, (a1)
-; CHECK-NEXT:    ret
+; CHECK-NO-MISALIGN-LABEL: strided_unaligned:
+; CHECK-NO-MISALIGN:       # %bb.0:
+; CHECK-NO-MISALIGN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-NO-MISALIGN-NEXT:    vle8.v v8, (a0)
+; CHECK-NO-MISALIGN-NEXT:    add a0, a0, a2
+; CHECK-NO-MISALIGN-NEXT:    vle8.v v9, (a0)
+; CHECK-NO-MISALIGN-NEXT:    vslideup.vi v8, v9, 4
+; CHECK-NO-MISALIGN-NEXT:    vse16.v v8, (a1)
+; CHECK-NO-MISALIGN-NEXT:    ret
+;
+; RV64-MISALIGN-LABEL: strided_unaligned:
+; RV64-MISALIGN:       # %bb.0:
+; RV64-MISALIGN-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV64-MISALIGN-NEXT:    vlse64.v v8, (a0), a2
+; RV64-MISALIGN-NEXT:    vse64.v v8, (a1)
+; RV64-MISALIGN-NEXT:    ret
   %a = load <4 x i16>, ptr %x, align 1
   %b.gep = getelementptr i8, ptr %x, i64 %s
   %b = load <4 x i16>, ptr %b.gep, align 1
