Merged main:96ef623a7525 into amd-gfx:cf4b971d071e

SC llvm team · SC llvm team · commit cf1a77e69a41 · 2023-11-13T11:02:28.000-05:00
Local branch amd-gfx cf4b971 Merged main:0a0e06f29145 into amd-gfx:481034665eb6 Remote branch main 96ef623 [AArch64] Cast predicate operand of SVE gather loads/scater stores to the parameter type of the intrinsic (NFC) (llvm#71289)
diff --git a/clang-tools-extra/include-cleaner/lib/WalkAST.cpp b/clang-tools-extra/include-cleaner/lib/WalkAST.cpp
@@ -11,6 +11,7 @@
 #include "clang/AST/ASTFwd.h"
 #include "clang/AST/Decl.h"
 #include "clang/AST/DeclCXX.h"
+#include "clang/AST/DeclFriend.h"
 #include "clang/AST/DeclTemplate.h"
 #include "clang/AST/Expr.h"
 #include "clang/AST/ExprCXX.h"
@@ -243,6 +244,14 @@ class ASTWalker : public RecursiveASTVisitor<ASTWalker> {
     return true;
   }
 
+  bool VisitFriendDecl(FriendDecl *D) {
+    // Friend types are already reported via their TypeLoc; a befriended
+    // declaration (getFriendDecl() non-null) must be reported explicitly.
+    if (auto *FD = D->getFriendDecl())
+      report(D->getLocation(), FD);
+    return true;
+  }
+
   bool VisitConceptReference(const ConceptReference *CR) {
     report(CR->getConceptNameLoc(), CR->getFoundDecl());
     return true;
diff --git a/clang-tools-extra/include-cleaner/unittests/WalkASTTest.cpp b/clang-tools-extra/include-cleaner/unittests/WalkASTTest.cpp
@@ -550,5 +550,10 @@ TEST(WalkAST, Concepts) {
   // FIXME: Foo should be explicitly referenced.
   testWalk("template<typename T> concept Foo = true;", "void func() { ^Foo auto x = 1; }");
 }
+
+TEST(WalkAST, FriendDecl) {
+  testWalk("void $explicit^foo();", "struct Bar { friend void ^foo(); };");
+  testWalk("struct $explicit^Foo {};", "struct Bar { friend struct ^Foo; };");
+}
 } // namespace
 } // namespace clang::include_cleaner
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -9482,13 +9482,6 @@ Value *CodeGenFunction::EmitSVEGatherLoad(const SVETypeFlags &TypeFlags,
   auto *OverloadedTy =
       llvm::ScalableVectorType::get(SVEBuiltinMemEltTy(TypeFlags), ResultTy);
 
-  // At the ACLE level there's only one predicate type, svbool_t, which is
-  // mapped to <n x 16 x i1>. However, this might be incompatible with the
-  // actual type being loaded. For example, when loading doubles (i64) the
-  // predicated should be <n x 2 x i1> instead. At the IR level the type of
-  // the predicate and the data being loaded must match. Cast accordingly.
-  Ops[0] = EmitSVEPredicateCast(Ops[0], OverloadedTy);
-
   Function *F = nullptr;
   if (Ops[1]->getType()->isVectorTy())
     // This is the "vector base, scalar offset" case. In order to uniquely
@@ -9502,6 +9495,16 @@ Value *CodeGenFunction::EmitSVEGatherLoad(const SVETypeFlags &TypeFlags,
     // intrinsic.
     F = CGM.getIntrinsic(IntID, OverloadedTy);
 
+  // At the ACLE level there's only one predicate type, svbool_t, which is
+  // mapped to <n x 16 x i1>. However, this might be incompatible with the
+  // actual type being loaded. For example, when loading doubles (i64) the
+  // predicate should be <n x 2 x i1> instead. At the IR level the type of
+  // the predicate and the data being loaded must match. Cast to the type
+  // expected by the intrinsic. The intrinsic itself should be defined in
+  // a way that enforces relations between parameter types.
+  Ops[0] = EmitSVEPredicateCast(
+      Ops[0], cast<llvm::ScalableVectorType>(F->getArg(0)->getType()));
+
   // Pass 0 when the offset is missing. This can only be applied when using
   // the "vector base" addressing mode for which ACLE allows no offset. The
   // corresponding LLVM IR always requires an offset.
@@ -9566,8 +9569,11 @@ Value *CodeGenFunction::EmitSVEScatterStore(const SVETypeFlags &TypeFlags,
   // mapped to <n x 16 x i1>. However, this might be incompatible with the
   // actual type being stored. For example, when storing doubles (i64) the
   // predicated should be <n x 2 x i1> instead. At the IR level the type of
-  // the predicate and the data being stored must match. Cast accordingly.
-  Ops[1] = EmitSVEPredicateCast(Ops[1], OverloadedTy);
+  // the predicate and the data being stored must match. Cast to the type
+  // expected by the intrinsic. The intrinsic itself should be defined in
+  // a way that enforces relations between parameter types.
+  Ops[1] = EmitSVEPredicateCast(
+      Ops[1], cast<llvm::ScalableVectorType>(F->getArg(1)->getType()));
 
   // For "vector base, scalar index" scale the index so that it becomes a
   // scalar offset.
diff --git a/clang/lib/CodeGen/CGExprScalar.cpp b/clang/lib/CodeGen/CGExprScalar.cpp
@@ -2224,18 +2224,8 @@ Value *ScalarExprEmitter::VisitCastExpr(CastExpr *CE) {
     return Visit(const_cast<Expr*>(E));
 
   case CK_NoOp: {
-    llvm::Value *V = CE->changesVolatileQualification()
-                         ? EmitLoadOfLValue(CE)
-                         : Visit(const_cast<Expr *>(E));
-    if (V) {
-      // CK_NoOp can model a pointer qualification conversion, which can remove
-      // an array bound and change the IR type.
-      // FIXME: Once pointee types are removed from IR, remove this.
-      llvm::Type *T = ConvertType(DestTy);
-      if (T != V->getType())
-        V = Builder.CreateBitCast(V, T);
-    }
-    return V;
+    return CE->changesVolatileQualification() ? EmitLoadOfLValue(CE)
+                                              : Visit(const_cast<Expr *>(E));
   }
 
   case CK_BaseToDerived: {
diff --git a/libc/cmake/modules/LLVMLibCObjectRules.cmake b/libc/cmake/modules/LLVMLibCObjectRules.cmake
@@ -89,6 +89,7 @@ function(get_nvptx_compile_options output_var gpu_arch)
   set(nvptx_options "")
   list(APPEND nvptx_options "-march=${gpu_arch}")
   list(APPEND nvptx_options "-Wno-unknown-cuda-version")
+  list(APPEND nvptx_options "SHELL:-mllvm -nvptx-emit-init-fini-kernel=false")
   if(${gpu_arch} STREQUAL "sm_35")
     list(APPEND nvptx_options "--cuda-feature=+ptx60")
   elseif(${gpu_arch} STREQUAL "sm_37")
diff --git a/llvm/include/llvm/Config/llvm-config.h.cmake b/llvm/include/llvm/Config/llvm-config.h.cmake
@@ -16,7 +16,7 @@
 
 /* Indicate that this is LLVM compiled from the amd-gfx branch. */
 #define LLVM_HAVE_BRANCH_AMD_GFX
-#define LLVM_MAIN_REVISION 480586
+#define LLVM_MAIN_REVISION 480604
 
 /* Define if LLVM_ENABLE_DUMP is enabled */
 #cmakedefine LLVM_ENABLE_DUMP
diff --git a/llvm/lib/CodeGen/LiveIntervalUnion.cpp b/llvm/lib/CodeGen/LiveIntervalUnion.cpp
@@ -14,7 +14,6 @@
 
 #include "llvm/CodeGen/LiveIntervalUnion.h"
 #include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SparseBitVector.h"
 #include "llvm/CodeGen/LiveInterval.h"
 #include "llvm/CodeGen/TargetRegisterInfo.h"
 #include "llvm/Support/raw_ostream.h"
diff --git a/llvm/lib/DebugInfo/PDB/Native/NamedStreamMap.cpp b/llvm/lib/DebugInfo/PDB/Native/NamedStreamMap.cpp
@@ -7,7 +7,6 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/DebugInfo/PDB/Native/NamedStreamMap.h"
-#include "llvm/ADT/SparseBitVector.h"
 #include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/DebugInfo/PDB/Native/Hash.h"
diff --git a/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp b/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp
@@ -30,7 +30,6 @@
 #include "llvm/ADT/ScopeExit.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/SparseBitVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineConstantPool.h"
diff --git a/mlir/include/mlir/Dialect/Tensor/IR/TensorBase.td b/mlir/include/mlir/Dialect/Tensor/IR/TensorBase.td
@@ -18,31 +18,35 @@ def Tensor_Dialect : Dialect {
   let description = [{
     The `tensor` dialect is intended to hold core tensor creation and
     manipulation ops, which are not strongly associated with any particular
-    other dialect or domain abstraction. The primary smoke test of this is ops
-    that make sense for any tensor element type.
-
-    We leave it to other dialects to hold the vast swath of possible
-    computations one might want to do on a tensor.
-
-    The `tensor` type is (for better or for worse) used to represent all kinds
-    of things, and supports an open-ended set of element types. Examples:
+    other dialect or domain abstraction. The aim for ops in this dialect is
+    that they make sense for any tensor element type. When this is not the
+    case, the op is left to live in other dialects. Examples of element types
+    that could be supported by the `tensor` dialect include:
 
     - representing large, dense aggregations of primitive types, suitable for
       high-performance numerical computing.
-    - representing shapes in the `shape` dialect, which consist of small
-      1D tensors of `index` data type.
+    - representing shapes in the `shape` dialect, which consist of small 1D
+      tensors of `index` data type.
     - representing aggregations of strings or “variant” types.
-    - representing large, sparse aggregations of primitive types, suitable
-      for high-performance numerical computing.
-
-    Thus, for the `tensor` dialect, we prefer for now to constrain the
-    scope as much as possible. The expectation is that at some point
-    in the future, the `tensor` dialect’s scope may be broadened through a
-    careful discussion of the tradeoffs.
-
-    The `tensor` type is actually a builtin type (it lives in the builtin
-    dialect), and does not live in this dialect.
+    - representing large, sparse aggregations of primitive types, suitable for
+      high-performance numerical computing.
 
+    Because of this broad element type support and because of the existence of
+    more dedicated dialects, such as the `sparse_tensor` and `linalg` dialects,
+    we prefer for now to keep the `tensor` dialect as small as possible. The
+    expectation is that at some point in the future, the `tensor` dialect’s
+    scope may be broadened through a careful discussion of the tradeoffs.
+
+    On the `tensor` type itself, note that it is actually a builtin type (it
+    lives in the builtin dialect), and does not live in this dialect.
+    Furthermore, a `tensor` is an immutable object. For example, this means
+    that a copy will always be made of the `tensor` object when it is passed to
+    the `dest` operand used by some ops in this dialect. As an optimization,
+    an implementation can eliminate these copies during lowering when they
+    are redundant and perform in-place mutation, see the [Destination-Passing
+    Style](
+    https://mlir.llvm.org/docs/Bufferization/#destination-passing-style)
+    documentation for more information.
   }];
 
   let hasCanonicalizer = 1;