[AArch64] Add missing Neon Types (#126945)

tmatheson-arm · davemgreen · web-flow · commit 832a7bb46061 · 2025-06-02T17:09:35.000+01:00
The AAPCS64 adds a number of vector types to the C unconditionally: https://github.com/ARM-software/abi-aa/blob/main/aapcs64/aapcs64.rst#11appendix-support-for-advanced-simd-extensions The equivalent SVE types are already available in clang: https://github.com/ARM-software/abi-aa/blob/main/aapcs64/aapcs64.rst#12appendix-support-for-scalable-vectors __mfp8 is defined in the ACLE https://arm-software.github.io/acle/main/acle.html#data-types --------- Co-authored-by: David Green <david.green@arm.com>
diff --git a/clang/include/clang/Basic/AArch64ACLETypes.def b/clang/include/clang/Basic/AArch64ACLETypes.def
@@ -6,7 +6,13 @@
 //
 //===----------------------------------------------------------------------===//
 //
-//  This file defines various SVE builtin types.  The macros are:
+//  This file defines various Neon and SVE builtin types.  The macros are:
+//
+//    NEON_VECTOR_TYPE:
+//    - (Name, BaseType, ElBits, NumEls, VectorKind)
+//    Unlike the SVE types, the Neon vector types are not builtin types and
+//    mapped to the equivalent __attribute__(neon_vector_type(...)) vector type.
+//    They are not builtin types.
 //
 //    SVE_TYPE:
 //    - (Name, MangledName, Id, SingletonId)
@@ -57,6 +63,10 @@
 //  - IsBF true for vector of brain float elements.
 //===----------------------------------------------------------------------===//
 
+#ifndef NEON_VECTOR_TYPE
+#define NEON_VECTOR_TYPE(Name, BaseType, ElBits, NumEls, VectorKind)
+#endif
+
 #ifndef SVE_TYPE
 #define SVE_TYPE(Name, Id, SingletonId)
 #endif
@@ -111,7 +121,38 @@
   SVE_TYPE(Name, Id, SingletonId)
 #endif
 
-//===- Vector point types -----------------------------------------------===//
+//===- Neon Vector point types --------------------------------------------===//
+
+NEON_VECTOR_TYPE(__Int8x8_t, CharTy, 8, 8, VectorKind::Neon)
+NEON_VECTOR_TYPE(__Int16x4_t, ShortTy, 16, 4, VectorKind::Neon)
+NEON_VECTOR_TYPE(__Int32x2_t, IntTy, 32, 2, VectorKind::Neon)
+NEON_VECTOR_TYPE(__Uint8x8_t, CharTy, 8, 8, VectorKind::Neon)
+NEON_VECTOR_TYPE(__Uint16x4_t, UnsignedShortTy, 16, 4, VectorKind::Neon)
+NEON_VECTOR_TYPE(__Uint32x2_t, UnsignedIntTy, 32, 2, VectorKind::Neon)
+NEON_VECTOR_TYPE(__Float16x4_t, Float16Ty, 16, 4, VectorKind::Neon)
+NEON_VECTOR_TYPE(__Float32x2_t, FloatTy, 32, 2, VectorKind::Neon)
+NEON_VECTOR_TYPE(__Poly8x8_t, CharTy, 8, 8, VectorKind::NeonPoly)
+NEON_VECTOR_TYPE(__Poly16x4_t, UnsignedShortTy, 16, 4, VectorKind::NeonPoly)
+NEON_VECTOR_TYPE(__Bfloat16x4_t, BFloat16Ty, 16, 4, VectorKind::Neon)
+NEON_VECTOR_TYPE(__Int8x16_t, CharTy, 8, 16, VectorKind::Neon)
+NEON_VECTOR_TYPE(__Int16x8_t, ShortTy, 16, 8, VectorKind::Neon)
+NEON_VECTOR_TYPE(__Int32x4_t, IntTy, 32, 4, VectorKind::Neon)
+NEON_VECTOR_TYPE(__Int64x2_t, LongLongTy, 64, 2, VectorKind::Neon)
+NEON_VECTOR_TYPE(__Uint8x16_t, CharTy, 8, 16, VectorKind::Neon)
+NEON_VECTOR_TYPE(__Uint16x8_t, UnsignedShortTy, 16, 8, VectorKind::Neon)
+NEON_VECTOR_TYPE(__Uint32x4_t, UnsignedIntTy, 32, 4, VectorKind::Neon)
+NEON_VECTOR_TYPE(__Uint64x2_t, UnsignedLongLongTy, 64, 2, VectorKind::Neon)
+NEON_VECTOR_TYPE(__Float16x8_t, Float16Ty, 16, 8, VectorKind::Neon)
+NEON_VECTOR_TYPE(__Float32x4_t, FloatTy, 32, 4, VectorKind::Neon)
+NEON_VECTOR_TYPE(__Float64x2_t, DoubleTy, 64, 2, VectorKind::Neon)
+NEON_VECTOR_TYPE(__Poly8x16_t, CharTy, 8, 16, VectorKind::NeonPoly)
+NEON_VECTOR_TYPE(__Poly16x8_t, UnsignedShortTy, 16, 8, VectorKind::NeonPoly)
+NEON_VECTOR_TYPE(__Poly64x2_t, UnsignedLongLongTy, 64, 2, VectorKind::NeonPoly)
+NEON_VECTOR_TYPE(__Bfloat16x8_t, BFloat16Ty, 16, 8, VectorKind::Neon)
+NEON_VECTOR_TYPE(__Mfloat8x8_t, MFloat8Ty, 8, 8, VectorKind::Neon)
+NEON_VECTOR_TYPE(__Mfloat8x16_t, MFloat8Ty, 8, 16, VectorKind::Neon)
+
+//===- SVE Vector point types ---------------------------------------------===//
 
 SVE_VECTOR_TYPE_INT(__SVInt8_t,  __SVInt8_t,  SveInt8,  SveInt8Ty, 16,  8, 1, true)
 SVE_VECTOR_TYPE_INT(__SVInt16_t, __SVInt16_t, SveInt16, SveInt16Ty, 8, 16, 1, true)
@@ -205,6 +246,7 @@ SVE_OPAQUE_TYPE(__SVCount_t, __SVCount_t, SveCount, SveCountTy)
 
 SVE_SCALAR_TYPE(__mfp8, __mfp8, MFloat8, MFloat8Ty, 8)
 
+#undef NEON_VECTOR_TYPE
 #undef SVE_VECTOR_TYPE
 #undef SVE_VECTOR_TYPE_MFLOAT
 #undef SVE_VECTOR_TYPE_BFLOAT
diff --git a/clang/include/clang/Basic/TargetInfo.h b/clang/include/clang/Basic/TargetInfo.h
@@ -270,7 +270,7 @@ class TargetInfo : public TransferrableTargetInfo,
   unsigned HasBuiltinMSVaList : 1;
 
   LLVM_PREFERRED_TYPE(bool)
-  unsigned HasAArch64SVETypes : 1;
+  unsigned HasAArch64ACLETypes : 1;
 
   LLVM_PREFERRED_TYPE(bool)
   unsigned HasRISCVVTypes : 1;
@@ -1055,9 +1055,9 @@ class TargetInfo : public TransferrableTargetInfo,
   /// available on this target.
   bool hasBuiltinMSVaList() const { return HasBuiltinMSVaList; }
 
-  /// Returns whether or not the AArch64 SVE built-in types are
+  /// Returns whether or not the AArch64 ACLE built-in types are
   /// available on this target.
-  bool hasAArch64SVETypes() const { return HasAArch64SVETypes; }
+  bool hasAArch64ACLETypes() const { return HasAArch64ACLETypes; }
 
   /// Returns whether or not the RISC-V V built-in types are
   /// available on this target.
diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp
@@ -1447,10 +1447,10 @@ void ASTContext::InitBuiltinTypes(const TargetInfo &Target,
 #include "clang/Basic/HLSLIntangibleTypes.def"
   }
 
-  if (Target.hasAArch64SVETypes() ||
-      (AuxTarget && AuxTarget->hasAArch64SVETypes())) {
-#define SVE_TYPE(Name, Id, SingletonId) \
-    InitBuiltinType(SingletonId, BuiltinType::Id);
+  if (Target.hasAArch64ACLETypes() ||
+      (AuxTarget && AuxTarget->hasAArch64ACLETypes())) {
+#define SVE_TYPE(Name, Id, SingletonId)                                        \
+  InitBuiltinType(SingletonId, BuiltinType::Id);
 #include "clang/Basic/AArch64ACLETypes.def"
   }
 
@@ -4529,7 +4529,7 @@ QualType ASTContext::getWebAssemblyExternrefType() const {
 /// type.
 QualType ASTContext::getScalableVectorType(QualType EltTy, unsigned NumElts,
                                            unsigned NumFields) const {
-  if (Target->hasAArch64SVETypes()) {
+  if (Target->hasAArch64ACLETypes()) {
     uint64_t EltTySize = getTypeSize(EltTy);
 
 #define SVE_VECTOR_TYPE_INT(Name, MangledName, Id, SingletonId, NumEls,        \
diff --git a/clang/lib/Basic/TargetInfo.cpp b/clang/lib/Basic/TargetInfo.cpp
@@ -157,7 +157,7 @@ TargetInfo::TargetInfo(const llvm::Triple &T) : Triple(T) {
   SSERegParmMax = 0;
   HasAlignMac68kSupport = false;
   HasBuiltinMSVaList = false;
-  HasAArch64SVETypes = false;
+  HasAArch64ACLETypes = false;
   HasRISCVVTypes = false;
   AllowAMDGPUUnsafeFPAtomics = false;
   HasUnalignedAccess = false;
diff --git a/clang/lib/Basic/Targets/AArch64.cpp b/clang/lib/Basic/Targets/AArch64.cpp
@@ -239,15 +239,15 @@ AArch64TargetInfo::AArch64TargetInfo(const llvm::Triple &Triple,
   // Make __builtin_ms_va_list available.
   HasBuiltinMSVaList = true;
 
-  // Make the SVE types available.  Note that this deliberately doesn't
-  // depend on SveMode, since in principle it should be possible to turn
+  // Make the Neon ACLE and SVE types available.  Note that this deliberately
+  // doesn't depend on SveMode, since in principle it should be possible to turn
   // SVE on and off within a translation unit.  It should also be possible
   // to compile the global declaration:
   //
   // __SVInt8_t *ptr;
   //
   // even without SVE.
-  HasAArch64SVETypes = true;
+  HasAArch64ACLETypes = true;
 
   // {} in inline assembly are neon specifiers, not assembly variant
   // specifiers.
diff --git a/clang/lib/Sema/Sema.cpp b/clang/lib/Sema/Sema.cpp
@@ -515,11 +515,14 @@ void Sema::Initialize() {
 #include "clang/Basic/OpenCLExtensionTypes.def"
   }
 
-  if (Context.getTargetInfo().hasAArch64SVETypes() ||
+  if (Context.getTargetInfo().hasAArch64ACLETypes() ||
       (Context.getAuxTargetInfo() &&
-       Context.getAuxTargetInfo()->hasAArch64SVETypes())) {
+       Context.getAuxTargetInfo()->hasAArch64ACLETypes())) {
 #define SVE_TYPE(Name, Id, SingletonId)                                        \
   addImplicitTypedef(#Name, Context.SingletonId);
+#define NEON_VECTOR_TYPE(Name, BaseType, ElBits, NumEls, VectorKind)           \
+  addImplicitTypedef(                                                          \
+      #Name, Context.getVectorType(Context.BaseType, NumEls, VectorKind));
 #include "clang/Basic/AArch64ACLETypes.def"
   }
 
diff --git a/clang/test/AST/ast-dump-aarch64-neon-types.c b/clang/test/AST/ast-dump-aarch64-neon-types.c
@@ -0,0 +1,125 @@
+// Test that NEON types are defined, even when arm_neon.h is not included.
+// as required by AAPCS64 "Support for Advanced SIMD Extensions".
+
+// RUN: %clang_cc1 -ast-dump -triple aarch64-linux-gnu %s -x c | FileCheck %s
+// RUN: %clang_cc1 -ast-dump -triple aarch64-linux-gnu %s -x c++ | FileCheck %s
+// RUN: %clang_cc1 -verify -verify-ignore-unexpected=note -triple x86_64 %s -x c
+// RUN: %clang_cc1 -verify -verify-ignore-unexpected=note -triple x86_64 %s -x c++
+// RUN: %clang_cc1 -verify -verify-ignore-unexpected=note -triple arm-linux-gnu %s -x c
+// RUN: %clang_cc1 -verify -verify-ignore-unexpected=note -triple arm-linux-gnu %s -x c++
+
+__Int8x8_t Int8x8;
+// CHECK: Int8x8 '__Int8x8_t':'__attribute__((neon_vector_type(8))) char'
+// expected-error@-2{{unknown type name '__Int8x8_t'}}
+
+__Int16x4_t Int16x4;
+// CHECK: Int16x4 '__Int16x4_t':'__attribute__((neon_vector_type(4))) short'
+// expected-error@-2{{unknown type name '__Int16x4_t'}}
+
+__Int32x2_t Int32x2;
+// CHECK: Int32x2 '__Int32x2_t':'__attribute__((neon_vector_type(2))) int'
+// expected-error@-2{{unknown type name '__Int32x2_t'}}
+
+__Uint8x8_t Uint8x8;
+// CHECK: Uint8x8 '__Uint8x8_t':'__attribute__((neon_vector_type(8))) char'
+// expected-error@-2{{unknown type name '__Uint8x8_t'}}
+
+__Uint16x4_t Uint16x4;
+// CHECK: Uint16x4 '__Uint16x4_t':'__attribute__((neon_vector_type(4))) unsigned short'
+// expected-error@-2{{unknown type name '__Uint16x4_t'}}
+
+__Uint32x2_t Uint32x2;
+// CHECK: Uint32x2 '__Uint32x2_t':'__attribute__((neon_vector_type(2))) unsigned int'
+// expected-error@-2{{unknown type name '__Uint32x2_t'}}
+
+__Float16x4_t Float16x4;
+// CHECK: Float16x4 '__Float16x4_t':'__attribute__((neon_vector_type(4))) _Float16'
+// expected-error@-2{{unknown type name '__Float16x4_t'}}
+
+__Float32x2_t Float32x2;
+// CHECK: Float32x2 '__Float32x2_t':'__attribute__((neon_vector_type(2))) float'
+// expected-error@-2{{unknown type name '__Float32x2_t'}}
+
+__Poly8x8_t Poly8x8;
+// CHECK: Poly8x8 '__Poly8x8_t':'__attribute__((neon_polyvector_type(8))) char'
+// expected-error@-2{{unknown type name '__Poly8x8_t'}}
+
+__Poly16x4_t Poly16x4;
+// CHECK: Poly16x4 '__Poly16x4_t':'__attribute__((neon_polyvector_type(4))) unsigned short'
+// expected-error@-2{{unknown type name '__Poly16x4_t'}}
+
+__Bfloat16x4_t Bfloat16x4;
+// CHECK: Bfloat16x4 '__Bfloat16x4_t':'__attribute__((neon_vector_type(4))) __bf16'
+// expected-error@-2{{unknown type name '__Bfloat16x4_t'}}
+
+__Int8x16_t Int8x16;
+// CHECK: Int8x16 '__Int8x16_t':'__attribute__((neon_vector_type(16))) char'
+// expected-error@-2{{unknown type name '__Int8x16_t'}}
+
+__Int16x8_t Int16x8;
+// CHECK: Int16x8 '__Int16x8_t':'__attribute__((neon_vector_type(8))) short'
+// expected-error@-2{{unknown type name '__Int16x8_t'}}
+
+__Int32x4_t Int32x4;
+// CHECK: Int32x4 '__Int32x4_t':'__attribute__((neon_vector_type(4))) int'
+// expected-error@-2{{unknown type name '__Int32x4_t'}}
+
+__Int64x2_t Int64x2;
+// CHECK: Int64x2 '__Int64x2_t':'__attribute__((neon_vector_type(2))) long long'
+// expected-error@-2{{unknown type name '__Int64x2_t'}}
+
+__Uint8x16_t Uint8x16;
+// CHECK: Uint8x16 '__Uint8x16_t':'__attribute__((neon_vector_type(16))) char'
+// expected-error@-2{{unknown type name '__Uint8x16_t'}}
+
+__Uint16x8_t Uint16x8;
+// CHECK: Uint16x8 '__Uint16x8_t':'__attribute__((neon_vector_type(8))) unsigned short'
+// expected-error@-2{{unknown type name '__Uint16x8_t'}}
+
+__Uint32x4_t Uint32x4;
+// CHECK: Uint32x4 '__Uint32x4_t':'__attribute__((neon_vector_type(4))) unsigned int'
+// expected-error@-2{{unknown type name '__Uint32x4_t'}}
+
+__Uint64x2_t Uint64x2;
+// CHECK: Uint64x2 '__Uint64x2_t':'__attribute__((neon_vector_type(2))) unsigned long long'
+// expected-error@-2{{unknown type name '__Uint64x2_t'}}
+
+__Float16x8_t Float16x8;
+// CHECK: Float16x8 '__Float16x8_t':'__attribute__((neon_vector_type(8))) _Float16'
+// expected-error@-2{{unknown type name '__Float16x8_t'}}
+
+__Float32x4_t Float32x4;
+// CHECK: Float32x4 '__Float32x4_t':'__attribute__((neon_vector_type(4))) float'
+// expected-error@-2{{unknown type name '__Float32x4_t'}}
+
+__Float64x2_t Float64x2;
+// CHECK: Float64x2 '__Float64x2_t':'__attribute__((neon_vector_type(2))) double'
+// expected-error@-2{{unknown type name '__Float64x2_t'}}
+
+__Poly8x16_t Poly8x16;
+// CHECK: Poly8x16 '__Poly8x16_t':'__attribute__((neon_polyvector_type(16))) char'
+// expected-error@-2{{unknown type name '__Poly8x16_t'}}
+
+__Poly16x8_t Poly16x8;
+// CHECK: Poly16x8 '__Poly16x8_t':'__attribute__((neon_polyvector_type(8))) unsigned short'
+// expected-error@-2{{unknown type name '__Poly16x8_t'}}
+
+__Poly64x2_t Poly64x2;
+// CHECK: Poly64x2 '__Poly64x2_t':'__attribute__((neon_polyvector_type(2))) unsigned long long'
+// expected-error@-2{{unknown type name '__Poly64x2_t'}}
+
+__Bfloat16x8_t Bfloat16x8;
+// CHECK: Bfloat16x8 '__Bfloat16x8_t':'__attribute__((neon_vector_type(8))) __bf16'
+// expected-error@-2{{unknown type name '__Bfloat16x8_t'}}
+
+__mfp8 mfp8;
+// CHECK: mfp8 '__mfp8'
+// expected-error@-2{{unknown type name '__mfp8'}}
+
+__Mfloat8x8_t Mfloat8x8;
+// CHECK: Mfloat8x8 '__Mfloat8x8_t':'__attribute__((neon_vector_type(8))) __mfp8'
+// expected-error@-2{{unknown type name '__Mfloat8x8_t'}}
+
+__Mfloat8x16_t Mfloat8x16;
+// CHECK: Mfloat8x16 '__Mfloat8x16_t':'__attribute__((neon_vector_type(16))) __mfp8'
+// expected-error@-2{{unknown type name '__Mfloat8x16_t'}}
diff --git a/clang/test/CodeGen/AArch64/mixed-neon-types.c b/clang/test/CodeGen/AArch64/mixed-neon-types.c
@@ -0,0 +1,73 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// RUN: %clang_cc1  -triple aarch64-linux-gnu -target-feature +neon -x c %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK-C
+// RUN: %clang_cc1  -triple aarch64-linux-gnu -target-feature +neon -x c++ %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK-CPP
+
+typedef __Uint32x4_t X;
+
+// CHECK-C-LABEL: define dso_local <4 x i32> @test(
+// CHECK-C-SAME: <4 x i32> noundef [[X:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-C-NEXT:  [[ENTRY:.*:]]
+// CHECK-C-NEXT:    [[X_ADDR:%.*]] = alloca <4 x i32>, align 16
+// CHECK-C-NEXT:    store <4 x i32> [[X]], ptr [[X_ADDR]], align 16
+// CHECK-C-NEXT:    [[TMP0:%.*]] = load <4 x i32>, ptr [[X_ADDR]], align 16
+// CHECK-C-NEXT:    ret <4 x i32> [[TMP0]]
+//
+// CHECK-CPP-LABEL: define dso_local noundef <4 x i32> @_Z4test12__Uint32x4_t(
+// CHECK-CPP-SAME: <4 x i32> noundef [[X:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-CPP-NEXT:  [[ENTRY:.*:]]
+// CHECK-CPP-NEXT:    [[X_ADDR:%.*]] = alloca <4 x i32>, align 16
+// CHECK-CPP-NEXT:    store <4 x i32> [[X]], ptr [[X_ADDR]], align 16
+// CHECK-CPP-NEXT:    [[TMP0:%.*]] = load <4 x i32>, ptr [[X_ADDR]], align 16
+// CHECK-CPP-NEXT:    ret <4 x i32> [[TMP0]]
+//
+X test(X x) {
+  return x;
+}
+
+#include <arm_neon.h>
+
+// CHECK-C-LABEL: define dso_local <16 x i8> @testboth(
+// CHECK-C-SAME: <4 x i32> noundef [[X:%.*]]) #[[ATTR0]] {
+// CHECK-C-NEXT:  [[ENTRY:.*:]]
+// CHECK-C-NEXT:    [[__P0_ADDR_I:%.*]] = alloca <16 x i8>, align 16
+// CHECK-C-NEXT:    [[__P1_ADDR_I:%.*]] = alloca <16 x i8>, align 16
+// CHECK-C-NEXT:    [[__RET_I:%.*]] = alloca <16 x i8>, align 16
+// CHECK-C-NEXT:    [[X_ADDR:%.*]] = alloca <4 x i32>, align 16
+// CHECK-C-NEXT:    store <4 x i32> [[X]], ptr [[X_ADDR]], align 16
+// CHECK-C-NEXT:    [[TMP0:%.*]] = load <4 x i32>, ptr [[X_ADDR]], align 16
+// CHECK-C-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
+// CHECK-C-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr [[X_ADDR]], align 16
+// CHECK-C-NEXT:    [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8>
+// CHECK-C-NEXT:    store <16 x i8> [[TMP1]], ptr [[__P0_ADDR_I]], align 16
+// CHECK-C-NEXT:    store <16 x i8> [[TMP3]], ptr [[__P1_ADDR_I]], align 16
+// CHECK-C-NEXT:    [[TMP4:%.*]] = load <16 x i8>, ptr [[__P0_ADDR_I]], align 16
+// CHECK-C-NEXT:    [[TMP5:%.*]] = load <16 x i8>, ptr [[__P1_ADDR_I]], align 16
+// CHECK-C-NEXT:    [[ADD_I:%.*]] = add <16 x i8> [[TMP4]], [[TMP5]]
+// CHECK-C-NEXT:    store <16 x i8> [[ADD_I]], ptr [[__RET_I]], align 16
+// CHECK-C-NEXT:    [[TMP6:%.*]] = load <16 x i8>, ptr [[__RET_I]], align 16
+// CHECK-C-NEXT:    ret <16 x i8> [[TMP6]]
+//
+// CHECK-CPP-LABEL: define dso_local noundef <16 x i8> @_Z8testboth12__Uint32x4_t(
+// CHECK-CPP-SAME: <4 x i32> noundef [[X:%.*]]) #[[ATTR0]] {
+// CHECK-CPP-NEXT:  [[ENTRY:.*:]]
+// CHECK-CPP-NEXT:    [[__P0_ADDR_I:%.*]] = alloca <16 x i8>, align 16
+// CHECK-CPP-NEXT:    [[__P1_ADDR_I:%.*]] = alloca <16 x i8>, align 16
+// CHECK-CPP-NEXT:    [[__RET_I:%.*]] = alloca <16 x i8>, align 16
+// CHECK-CPP-NEXT:    [[X_ADDR:%.*]] = alloca <4 x i32>, align 16
+// CHECK-CPP-NEXT:    store <4 x i32> [[X]], ptr [[X_ADDR]], align 16
+// CHECK-CPP-NEXT:    [[TMP0:%.*]] = load <4 x i32>, ptr [[X_ADDR]], align 16
+// CHECK-CPP-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
+// CHECK-CPP-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr [[X_ADDR]], align 16
+// CHECK-CPP-NEXT:    [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8>
+// CHECK-CPP-NEXT:    store <16 x i8> [[TMP1]], ptr [[__P0_ADDR_I]], align 16
+// CHECK-CPP-NEXT:    store <16 x i8> [[TMP3]], ptr [[__P1_ADDR_I]], align 16
+// CHECK-CPP-NEXT:    [[TMP4:%.*]] = load <16 x i8>, ptr [[__P0_ADDR_I]], align 16
+// CHECK-CPP-NEXT:    [[TMP5:%.*]] = load <16 x i8>, ptr [[__P1_ADDR_I]], align 16
+// CHECK-CPP-NEXT:    [[ADD_I:%.*]] = add <16 x i8> [[TMP4]], [[TMP5]]
+// CHECK-CPP-NEXT:    store <16 x i8> [[ADD_I]], ptr [[__RET_I]], align 16
+// CHECK-CPP-NEXT:    [[TMP6:%.*]] = load <16 x i8>, ptr [[__RET_I]], align 16
+// CHECK-CPP-NEXT:    ret <16 x i8> [[TMP6]]
+//
+int8x16_t testboth(X x) {
+   return vaddq_u8(x, x);
+}