
Commit a88bc31

SC llvm team authored and committed
Merged main:fa9b1be45088dce1e4b602d451f118128b94237b into amd-gfx:fb552d673f5b
Local branch amd-gfx fb552d6 Merged main:e578314c049bb9ae6dc3983db5cf27513e29517b into amd-gfx:f40f282b2831
Remote branch main fa9b1be [ThinLTO]Mark referencers of local ifunc not eligible for import (llvm#92431)
2 parents fb552d6 + fa9b1be commit a88bc31

14 files changed: +350 −245 lines changed

clang/lib/CodeGen/CodeGenFunction.h

Lines changed: 2 additions & 1 deletion
@@ -137,7 +137,8 @@ enum TypeEvaluationKind {
   SANITIZER_CHECK(SubOverflow, sub_overflow, 0)                              \
   SANITIZER_CHECK(TypeMismatch, type_mismatch, 1)                            \
   SANITIZER_CHECK(AlignmentAssumption, alignment_assumption, 0)              \
-  SANITIZER_CHECK(VLABoundNotPositive, vla_bound_not_positive, 0)
+  SANITIZER_CHECK(VLABoundNotPositive, vla_bound_not_positive, 0)            \
+  SANITIZER_CHECK(BoundsSafety, bounds_safety, 0)
 
 enum SanitizerHandler {
 #define SANITIZER_CHECK(Enum, Name, Version) Enum,
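
This list is an X-macro: each SANITIZER_CHECK entry expands under whatever definition of the macro is in scope at the point of use, so appending BoundsSafety to the list automatically grows the SanitizerHandler enum shown in the context lines. A minimal, self-contained sketch of the pattern, using only the two entries visible in this diff (LIST_SANITIZER_CHECKS stands in for Clang's full list):

// X-macro list: expansion is deferred to the point of use.
#define LIST_SANITIZER_CHECKS                                         \
  SANITIZER_CHECK(VLABoundNotPositive, vla_bound_not_positive, 0)     \
  SANITIZER_CHECK(BoundsSafety, bounds_safety, 0)

// One expansion: generate an enumerator per entry.
enum SanitizerHandler {
#define SANITIZER_CHECK(Enum, Name, Version) Enum,
  LIST_SANITIZER_CHECKS
#undef SANITIZER_CHECK
};
// SanitizerHandler now has VLABoundNotPositive and BoundsSafety; other
// expansions of the same list can emit handler names, versions, etc.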

clang/test/SemaCXX/cxx20-ctad-type-alias.cpp

Lines changed: 5 additions & 5 deletions
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fsyntax-only -Wno-c++11-narrowing -Wno-literal-conversion -std=c++20 -verify %s
+// RUN: %clang_cc1 -fsyntax-only -triple x86_64-unknown-linux -Wno-c++11-narrowing -Wno-literal-conversion -std=c++20 -verify %s
 
 namespace test1 {
 template <typename T>
@@ -74,7 +74,7 @@ struct Foo {
 template <typename T>
 using AF = Foo<T, 1>;
 
-AF b{0};
+AF b{0};
 } // namespace test6
 
 namespace test7 {
@@ -86,8 +86,8 @@ struct Foo {
 template <typename U>
 using AF1 = Foo<U>;
 template <typename K>
-using AF2 = AF1<K>;
-AF2 b = 1;
+using AF2 = AF1<K>;
+AF2 b = 1;
 } // namespace test7
 
 namespace test8 {
@@ -149,7 +149,7 @@ namespace test12 {
 template<typename X>
 struct Foo {
   template<typename K>
-  struct Bar {
+  struct Bar {
     Bar(K);
   };

llvm/include/llvm/Config/llvm-config.h.cmake

Lines changed: 1 addition & 1 deletion
@@ -16,7 +16,7 @@
 
 /* Indicate that this is LLVM compiled from the amd-gfx branch. */
 #define LLVM_HAVE_BRANCH_AMD_GFX
-#define LLVM_MAIN_REVISION 498973
+#define LLVM_MAIN_REVISION 498980
 
 /* Define if LLVM_ENABLE_DUMP is enabled */
 #cmakedefine LLVM_ENABLE_DUMP

llvm/include/llvm/TableGen/Record.h

Lines changed: 1 addition & 1 deletion
@@ -2098,7 +2098,7 @@ class RecordKeeper {
 /// Sorting predicate to sort record pointers by name.
 struct LessRecord {
   bool operator()(const Record *Rec1, const Record *Rec2) const {
-    return StringRef(Rec1->getName()).compare_numeric(Rec2->getName()) < 0;
+    return Rec1->getName().compare_numeric(Rec2->getName()) < 0;
   }
 };

llvm/lib/Analysis/ModuleSummaryAnalysis.cpp

Lines changed: 29 additions & 13 deletions
@@ -95,9 +95,12 @@ extern cl::opt<unsigned> MaxNumVTableAnnotations;
 // global vars at all. When importing function we aren't interested if any
 // instruction in it takes an address of any basic block, because instruction
 // can only take an address of basic block located in the same function.
+// Set `RefLocalLinkageIFunc` to true if the analyzed value references a
+// local-linkage ifunc.
 static bool findRefEdges(ModuleSummaryIndex &Index, const User *CurUser,
                          SetVector<ValueInfo, std::vector<ValueInfo>> &RefEdges,
-                         SmallPtrSet<const User *, 8> &Visited) {
+                         SmallPtrSet<const User *, 8> &Visited,
+                         bool &RefLocalLinkageIFunc) {
   bool HasBlockAddress = false;
   SmallVector<const User *, 32> Worklist;
   if (Visited.insert(CurUser).second)
@@ -119,8 +122,18 @@ static bool findRefEdges(ModuleSummaryIndex &Index, const User *CurUser,
       // We have a reference to a global value. This should be added to
       // the reference set unless it is a callee. Callees are handled
      // specially by WriteFunction and are added to a separate list.
-      if (!(CB && CB->isCallee(&OI)))
+      if (!(CB && CB->isCallee(&OI))) {
+        // If an ifunc has local linkage, do not add it into ref edges, and
+        // sets `RefLocalLinkageIFunc` to true. The referencer is not eligible
+        // for import. An ifunc doesn't have summary and ThinLTO cannot
+        // promote it; importing the referencer may cause linkage errors.
+        if (auto *GI = dyn_cast_if_present<GlobalIFunc>(GV);
+            GI && GI->hasLocalLinkage()) {
+          RefLocalLinkageIFunc = true;
+          continue;
+        }
         RefEdges.insert(Index.getOrInsertValueInfo(GV));
+      }
       continue;
     }
     if (Visited.insert(Operand).second)
@@ -313,7 +326,8 @@ static void computeFunctionSummary(
 
   // Add personality function, prefix data and prologue data to function's ref
   // list.
-  findRefEdges(Index, &F, RefEdges, Visited);
+  bool HasLocalIFuncCallOrRef = false;
+  findRefEdges(Index, &F, RefEdges, Visited, HasLocalIFuncCallOrRef);
   std::vector<const Instruction *> NonVolatileLoads;
   std::vector<const Instruction *> NonVolatileStores;
 
@@ -326,7 +340,6 @@
 
   bool HasInlineAsmMaybeReferencingInternal = false;
   bool HasIndirBranchToBlockAddress = false;
-  bool HasIFuncCall = false;
   bool HasUnknownCall = false;
   bool MayThrow = false;
   for (const BasicBlock &BB : F) {
@@ -372,11 +385,11 @@
         // of calling it we should add GV to RefEdges directly.
         RefEdges.insert(Index.getOrInsertValueInfo(GV));
       else if (auto *U = dyn_cast<User>(Stored))
-        findRefEdges(Index, U, RefEdges, Visited);
+        findRefEdges(Index, U, RefEdges, Visited, HasLocalIFuncCallOrRef);
       continue;
     }
   }
-  findRefEdges(Index, &I, RefEdges, Visited);
+  findRefEdges(Index, &I, RefEdges, Visited, HasLocalIFuncCallOrRef);
   const auto *CB = dyn_cast<CallBase>(&I);
   if (!CB) {
     if (I.mayThrow())
@@ -450,7 +463,7 @@
       // Non-local ifunc is not cloned and does not have the issue.
       if (auto *GI = dyn_cast_if_present<GlobalIFunc>(CalledValue))
         if (GI->hasLocalLinkage())
-          HasIFuncCall = true;
+          HasLocalIFuncCallOrRef = true;
       // Skip inline assembly calls.
       if (CI && CI->isInlineAsm())
        continue;
@@ -555,7 +568,7 @@
                    SmallPtrSet<const User *, 8> &Cache) {
     for (const auto *I : Instrs) {
       Cache.erase(I);
-      findRefEdges(Index, I, Edges, Cache);
+      findRefEdges(Index, I, Edges, Cache, HasLocalIFuncCallOrRef);
     }
   };
 
@@ -631,9 +644,9 @@
 #endif
 
   bool NonRenamableLocal = isNonRenamableLocal(F);
-  bool NotEligibleForImport = NonRenamableLocal ||
-                              HasInlineAsmMaybeReferencingInternal ||
-                              HasIndirBranchToBlockAddress || HasIFuncCall;
+  bool NotEligibleForImport =
+      NonRenamableLocal || HasInlineAsmMaybeReferencingInternal ||
+      HasIndirBranchToBlockAddress || HasLocalIFuncCallOrRef;
   GlobalValueSummary::GVFlags Flags(
       F.getLinkage(), F.getVisibility(), NotEligibleForImport,
       /* Live = */ false, F.isDSOLocal(), F.canBeOmittedFromSymbolTable(),
@@ -787,7 +800,10 @@ static void computeVariableSummary(ModuleSummaryIndex &Index,
                                    SmallVectorImpl<MDNode *> &Types) {
   SetVector<ValueInfo, std::vector<ValueInfo>> RefEdges;
   SmallPtrSet<const User *, 8> Visited;
-  bool HasBlockAddress = findRefEdges(Index, &V, RefEdges, Visited);
+  bool RefLocalIFunc = false;
+  bool HasBlockAddress =
+      findRefEdges(Index, &V, RefEdges, Visited, RefLocalIFunc);
+  const bool NotEligibleForImport = (HasBlockAddress || RefLocalIFunc);
   bool NonRenamableLocal = isNonRenamableLocal(V);
   GlobalValueSummary::GVFlags Flags(
       V.getLinkage(), V.getVisibility(), NonRenamableLocal,
@@ -821,7 +837,7 @@ static void computeVariableSummary(ModuleSummaryIndex &Index,
                      RefEdges.takeVector());
   if (NonRenamableLocal)
     CantBePromoted.insert(V.getGUID());
-  if (HasBlockAddress)
+  if (NotEligibleForImport)
     GVarSummary->setNotEligibleToImport();
   if (!VTableFuncs.empty())
     GVarSummary->setVTableFuncs(VTableFuncs);
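
For context on what the new flag guards against: a local-linkage ifunc has no ThinLTO summary and cannot be renamed or promoted, so any function or variable that references one must stay in its defining module. A hypothetical C++ translation unit showing the pattern (the names and the ELF-specific ifunc attribute usage are illustrative, not taken from this commit):

// Two private implementations selected at load time.
static int impl_generic(int x) { return x + 1; }
static int impl_avx(int x) { return x + 2; }

// Ifunc resolver: returns the implementation to bind.
static void *resolve_dispatch(void) {
  return __builtin_cpu_supports("avx") ? (void *)impl_avx
                                       : (void *)impl_generic;
}

// Local-linkage ifunc: invisible outside this module, no summary entry.
static int dispatch(int) __attribute__((ifunc("resolve_dispatch")));

int caller(int x) {
  // computeFunctionSummary now sets HasLocalIFuncCallOrRef for `caller`,
  // marking it not eligible for import: if `caller` were imported into
  // another module, its reference to `dispatch` could not resolve there.
  return dispatch(x);
}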

llvm/lib/TableGen/Record.cpp

Lines changed: 1 addition & 3 deletions
@@ -3251,9 +3251,7 @@ std::vector<Record *> RecordKeeper::getAllDerivedDefinitions(
     Defs.push_back(OneDef.second.get());
   }
 
-  llvm::sort(Defs, [](Record *LHS, Record *RHS) {
-    return LHS->getName().compare_numeric(RHS->getName()) < 0;
-  });
+  llvm::sort(Defs, LessRecord());
 
   return Defs;
 }
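
Both TableGen changes rely on StringRef::compare_numeric, which compares embedded digit runs by numeric value rather than character by character, so the deleted lambda was equivalent to the existing LessRecord predicate. A small standalone sketch of the ordering it produces (the record-like names are made up):

#include "llvm/ADT/StringRef.h"
#include <algorithm>
#include <vector>

int main() {
  std::vector<llvm::StringRef> Names = {"REG10", "REG2", "REG1"};
  // Same comparison LessRecord delegates to: numbers compare by value,
  // so "REG2" sorts before "REG10"; plain lexicographic order would
  // put "REG10" first.
  std::sort(Names.begin(), Names.end(),
            [](llvm::StringRef A, llvm::StringRef B) {
              return A.compare_numeric(B) < 0;
            });
  // Names is now {"REG1", "REG2", "REG10"}.
  return 0;
}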

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 70 additions & 30 deletions
@@ -544,6 +544,13 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
   if (!Subtarget.is64Bit())
     setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom);
 
+  if (Subtarget.is64Bit() && Subtarget.hasAVX()) {
+    // All CPUs supporting AVX will atomically load/store aligned 128-bit
+    // values, so we can emit [V]MOVAPS/[V]MOVDQA.
+    setOperationAction(ISD::ATOMIC_LOAD, MVT::i128, Custom);
+    setOperationAction(ISD::ATOMIC_STORE, MVT::i128, Custom);
+  }
+
   if (Subtarget.canUseCMPXCHG16B())
     setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
 
@@ -30415,32 +30422,40 @@
 X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
   Type *MemType = SI->getValueOperand()->getType();
 
-  bool NoImplicitFloatOps =
-      SI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
-  if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
-      !Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
-      (Subtarget.hasSSE1() || Subtarget.hasX87()))
-    return AtomicExpansionKind::None;
+  if (!SI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat) &&
+      !Subtarget.useSoftFloat()) {
+    if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
+        (Subtarget.hasSSE1() || Subtarget.hasX87()))
+      return AtomicExpansionKind::None;
+
+    if (MemType->getPrimitiveSizeInBits() == 128 && Subtarget.is64Bit() &&
+        Subtarget.hasAVX())
+      return AtomicExpansionKind::None;
+  }
 
   return needsCmpXchgNb(MemType) ? AtomicExpansionKind::Expand
                                  : AtomicExpansionKind::None;
 }
 
 // Note: this turns large loads into lock cmpxchg8b/16b.
-// TODO: In 32-bit mode, use MOVLPS when SSE1 is available?
 TargetLowering::AtomicExpansionKind
 X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
   Type *MemType = LI->getType();
 
-  // If this a 64 bit atomic load on a 32-bit target and SSE2 is enabled, we
-  // can use movq to do the load. If we have X87 we can load into an 80-bit
-  // X87 register and store it to a stack temporary.
-  bool NoImplicitFloatOps =
-      LI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
-  if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
-      !Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
-      (Subtarget.hasSSE1() || Subtarget.hasX87()))
-    return AtomicExpansionKind::None;
+  if (!LI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat) &&
+      !Subtarget.useSoftFloat()) {
+    // If this a 64 bit atomic load on a 32-bit target and SSE2 is enabled, we
+    // can use movq to do the load. If we have X87 we can load into an 80-bit
+    // X87 register and store it to a stack temporary.
+    if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
+        (Subtarget.hasSSE1() || Subtarget.hasX87()))
+      return AtomicExpansionKind::None;
+
+    // If this is a 128-bit load with AVX, 128-bit SSE loads/stores are atomic.
+    if (MemType->getPrimitiveSizeInBits() == 128 && Subtarget.is64Bit() &&
+        Subtarget.hasAVX())
+      return AtomicExpansionKind::None;
+  }
 
   return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
                                  : AtomicExpansionKind::None;
@@ -31683,14 +31698,21 @@ static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG,
   if (!IsSeqCst && IsTypeLegal)
     return Op;
 
-  if (VT == MVT::i64 && !IsTypeLegal) {
+  if (!IsTypeLegal && !Subtarget.useSoftFloat() &&
+      !DAG.getMachineFunction().getFunction().hasFnAttribute(
+          Attribute::NoImplicitFloat)) {
+    SDValue Chain;
+    // For illegal i128 atomic_store, when AVX is enabled, we can simply emit a
+    // vector store.
+    if (VT == MVT::i128 && Subtarget.is64Bit() && Subtarget.hasAVX()) {
+      SDValue VecVal = DAG.getBitcast(MVT::v2i64, Node->getVal());
+      Chain = DAG.getStore(Node->getChain(), dl, VecVal, Node->getBasePtr(),
+                           Node->getMemOperand());
+    }
+
     // For illegal i64 atomic_stores, we can try to use MOVQ or MOVLPS if SSE
     // is enabled.
-    bool NoImplicitFloatOps =
-        DAG.getMachineFunction().getFunction().hasFnAttribute(
-            Attribute::NoImplicitFloat);
-    if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
-      SDValue Chain;
+    if (VT == MVT::i64) {
       if (Subtarget.hasSSE1()) {
         SDValue SclToVec =
             DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Node->getVal());
@@ -31722,15 +31744,15 @@
         DAG.getMemIntrinsicNode(X86ISD::FIST, dl, DAG.getVTList(MVT::Other),
                                 StoreOps, MVT::i64, Node->getMemOperand());
     }
+    }
 
-      if (Chain) {
-        // If this is a sequentially consistent store, also emit an appropriate
-        // barrier.
-        if (IsSeqCst)
-          Chain = emitLockedStackOp(DAG, Subtarget, Chain, dl);
+    if (Chain) {
+      // If this is a sequentially consistent store, also emit an appropriate
+      // barrier.
+      if (IsSeqCst)
+        Chain = emitLockedStackOp(DAG, Subtarget, Chain, dl);
 
-        return Chain;
-      }
+      return Chain;
     }
   }
 
@@ -33303,12 +33325,30 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
     return;
   }
   case ISD::ATOMIC_LOAD: {
-    assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
+    assert(
+        (N->getValueType(0) == MVT::i64 || N->getValueType(0) == MVT::i128) &&
+        "Unexpected VT!");
     bool NoImplicitFloatOps =
         DAG.getMachineFunction().getFunction().hasFnAttribute(
             Attribute::NoImplicitFloat);
     if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
       auto *Node = cast<AtomicSDNode>(N);
+
+      if (N->getValueType(0) == MVT::i128) {
+        if (Subtarget.is64Bit() && Subtarget.hasAVX()) {
+          SDValue Ld = DAG.getLoad(MVT::v2i64, dl, Node->getChain(),
+                                   Node->getBasePtr(), Node->getMemOperand());
+          SDValue ResL = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
+                                     DAG.getIntPtrConstant(0, dl));
+          SDValue ResH = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
+                                     DAG.getIntPtrConstant(1, dl));
+          Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, N->getValueType(0),
+                                        {ResL, ResH}));
+          Results.push_back(Ld.getValue(1));
+          return;
+        }
+        break;
+      }
       if (Subtarget.hasSSE1()) {
         // Use a VZEXT_LOAD which will be selected as MOVQ or XORPS+MOVLPS.
        // Then extract the lower 64-bits.
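
At the source level, the effect of these hooks is that a naturally aligned 16-byte atomic on an AVX-capable x86-64 target can compile to a single 16-byte vector move instead of a lock cmpxchg16b loop. A hedged illustration (whether the accesses are inlined like this depends on the compiler, flags such as -mavx, and the C++ runtime; the snippet is not part of the commit):

#include <atomic>

// 16 bytes, naturally aligned; lock-free on x86-64 with cmpxchg16b/AVX.
std::atomic<__int128> Counter{};

__int128 read_relaxed() {
  // Previously lowered through a lock cmpxchg16b loop; with this change
  // an AVX build can emit a single vmovdqa/vmovaps load.
  return Counter.load(std::memory_order_relaxed);
}

void write_relaxed(__int128 v) {
  // Likewise a single 16-byte vector store (a fence is added only for
  // seq_cst stores).
  Counter.store(v, std::memory_order_relaxed);
}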

llvm/test/CodeGen/X86/atomic-non-integer-fp128.ll

Lines changed: 3 additions & 28 deletions
@@ -28,22 +28,7 @@ define void @store_fp128(ptr %fptr, fp128 %v) {
 ;
 ; X64-AVX-LABEL: store_fp128:
 ; X64-AVX:       # %bb.0:
-; X64-AVX-NEXT:    pushq %rbx
-; X64-AVX-NEXT:    .cfi_def_cfa_offset 16
-; X64-AVX-NEXT:    .cfi_offset %rbx, -16
-; X64-AVX-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT:    movq -{{[0-9]+}}(%rsp), %rbx
-; X64-AVX-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
-; X64-AVX-NEXT:    movq (%rdi), %rax
-; X64-AVX-NEXT:    movq 8(%rdi), %rdx
-; X64-AVX-NEXT:    .p2align 4, 0x90
-; X64-AVX-NEXT:  .LBB0_1: # %atomicrmw.start
-; X64-AVX-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-AVX-NEXT:    lock cmpxchg16b (%rdi)
-; X64-AVX-NEXT:    jne .LBB0_1
-; X64-AVX-NEXT:  # %bb.2: # %atomicrmw.end
-; X64-AVX-NEXT:    popq %rbx
-; X64-AVX-NEXT:    .cfi_def_cfa_offset 8
+; X64-AVX-NEXT:    vmovaps %xmm0, (%rdi)
 ; X64-AVX-NEXT:    retq
   store atomic fp128 %v, ptr %fptr unordered, align 16
   ret void
@@ -69,19 +54,9 @@ define fp128 @load_fp128(ptr %fptr) {
 ;
 ; X64-AVX-LABEL: load_fp128:
 ; X64-AVX:       # %bb.0:
-; X64-AVX-NEXT:    pushq %rbx
-; X64-AVX-NEXT:    .cfi_def_cfa_offset 16
-; X64-AVX-NEXT:    .cfi_offset %rbx, -16
-; X64-AVX-NEXT:    xorl %eax, %eax
-; X64-AVX-NEXT:    xorl %edx, %edx
-; X64-AVX-NEXT:    xorl %ecx, %ecx
-; X64-AVX-NEXT:    xorl %ebx, %ebx
-; X64-AVX-NEXT:    lock cmpxchg16b (%rdi)
-; X64-AVX-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; X64-AVX-NEXT:    vmovaps (%rdi), %xmm0
+; X64-AVX-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-AVX-NEXT:    vmovaps -{{[0-9]+}}(%rsp), %xmm0
-; X64-AVX-NEXT:    popq %rbx
-; X64-AVX-NEXT:    .cfi_def_cfa_offset 8
 ; X64-AVX-NEXT:    retq
   %v = load atomic fp128, ptr %fptr unordered, align 16
   ret fp128 %v

llvm/test/CodeGen/X86/atomic-non-integer.ll

Lines changed: 0 additions & 1 deletion
@@ -131,7 +131,6 @@ define void @store_double(ptr %fptr, double %v) {
   ret void
 }
 
-
 define half @load_half(ptr %fptr) {
 ; X86-SSE1-LABEL: load_half:
 ; X86-SSE1:       # %bb.0:
