llvm · efriedma-quic · Feb 25, 2025 · Feb 10, 2025 · Feb 12, 2025
diff --git a/llvm/lib/CodeGen/MachineInstr.cpp b/llvm/lib/CodeGen/MachineInstr.cpp
@@ -1308,11 +1308,28 @@ bool MachineInstr::isSafeToMove(bool &SawStore) const {
     return false;
   }
 
+  // Don't touch instructions that have non-trivial invariants.  For example,
+  // terminators have to be at the end of a basic block.
   if (isPosition() || isDebugInstr() || isTerminator() ||
-      mayRaiseFPException() || hasUnmodeledSideEffects() ||
       isJumpTableDebugInfo())
     return false;
 
+  // Don't touch instructions which can have non-load/store effects.
+  //
+  // Inline asm has a "sideeffect" marker to indicate whether the asm has
+  // intentional side-effects. Even if an inline asm is not "sideeffect",
+  // though, it still can't be speculatively executed: the operation might
+  // not be valid on the current target, or for some combinations of operands.
+  // (Some transforms that move an instruction don't speculatively execute it;
+  // we currently don't try to handle that distinction here.)
+  //
+  // Other instructions handled here include those that can raise FP
+  // exceptions, x86 "DIV" instructions which trap on divide by zero, and
+  // stack adjustments.
+  if (mayRaiseFPException() || hasProperty(MCID::UnmodeledSideEffects) ||
+      isInlineAsm())
+    return false;
+
   // See if this instruction does a load.  If so, we have to guarantee that the
   // loaded value doesn't change between the load and the its intended
   // destination. The check for isInvariantLoad gives the target the chance to

diff --git a/llvm/test/CodeGen/AArch64/inline-asm-speculation.ll b/llvm/test/CodeGen/AArch64/inline-asm-speculation.ll
@@ -0,0 +1,38 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=aarch64 < %s | FileCheck %s
+; Make sure we don't speculatively execute inline asm statements.
+
+define dso_local i32 @main(i32 %argc, ptr %argv) {
+; CHECK-LABEL: main:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    cmp w0, #3
+; CHECK-NEXT:    b.ne .LBB0_2
+; CHECK-NEXT:  // %bb.1: // %if.then
+; CHECK-NEXT:    //APP
+; CHECK-NEXT:    mrs x0, SPSR_EL2
+; CHECK-NEXT:    //NO_APP
+; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB0_2: // %if.else
+; CHECK-NEXT:    //APP
+; CHECK-NEXT:    mrs x0, SPSR_EL1
+; CHECK-NEXT:    //NO_APP
+; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT:    ret
+entry:
+  %cmp = icmp eq i32 %argc, 3
+  br i1 %cmp, label %if.then, label %if.else
+
+if.then:
+  %0 = tail call i64 asm "mrs $0, SPSR_EL2", "=r"()
+  br label %if.end
+
+if.else:
+  %1 = tail call i64 asm "mrs $0, SPSR_EL1", "=r"()
+  br label %if.end
+
+if.end:
+  %y.0.in = phi i64 [ %0, %if.then ], [ %1, %if.else ]
+  %y.0 = trunc i64 %y.0.in to i32
+  ret i32 %y.0
+}
diff --git a/llvm/test/CodeGen/AMDGPU/convergent-inlineasm.ll b/llvm/test/CodeGen/AMDGPU/convergent-inlineasm.ll
@@ -23,12 +23,8 @@ bb5:                                              ; preds = %bb3, %bb
 }
 
 ; GCN-LABEL: {{^}}nonconvergent_inlineasm:
-; GCN: s_cbranch_execz
-
-; GCN: ; %bb.{{[0-9]+}}:
 ; GCN: v_cmp_ne_u32_e64
-
-; GCN: BB{{[0-9]+_[0-9]+}}:
+; GCN: s_cbranch_execz
 
 define amdgpu_kernel void @nonconvergent_inlineasm(ptr addrspace(1) nocapture %arg) {
 bb:

diff --git a/llvm/test/CodeGen/AMDGPU/early-if-convert.ll b/llvm/test/CodeGen/AMDGPU/early-if-convert.ll
@@ -55,11 +55,8 @@ endif:
 }
 
 ; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle_vcc_clobber:
+; GCN: s_cbranch_vccnz
 ; GCN: ; clobber vcc
-; GCN: v_cmp_neq_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], s{{[0-9]+}}, 1.0
-; GCN: v_add_i32_e32 v{{[0-9]+}}, vcc
-; GCN: s_mov_b64 vcc, [[CMP]]
-; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, vcc
 define amdgpu_kernel void @test_vccnz_ifcvt_triangle_vcc_clobber(ptr addrspace(1) %out, ptr addrspace(1) %in, float %k) #0 {
 entry:
   %v = load i32, ptr addrspace(1) %in

diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll
@@ -2015,13 +2015,14 @@ define amdgpu_kernel void @sdwa_crash_inlineasm_def() #0 {
 ; NOSDWA-LABEL: sdwa_crash_inlineasm_def:
 ; NOSDWA:       ; %bb.0: ; %bb
 ; NOSDWA-NEXT:    s_mov_b32 s0, 0xffff
+; NOSDWA-NEXT:    s_and_b64 vcc, exec, -1
+; NOSDWA-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; NOSDWA-NEXT:  .LBB21_1: ; %bb1
+; NOSDWA-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; NOSDWA-NEXT:    ;;#ASMSTART
 ; NOSDWA-NEXT:    v_and_b32_e32 v0, s0, v0
 ; NOSDWA-NEXT:    ;;#ASMEND
 ; NOSDWA-NEXT:    v_or_b32_e32 v0, 0x10000, v0
-; NOSDWA-NEXT:    s_and_b64 vcc, exec, -1
-; NOSDWA-NEXT:  .LBB21_1: ; %bb1
-; NOSDWA-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; NOSDWA-NEXT:    flat_store_dwordx2 v[0:1], v[0:1]
 ; NOSDWA-NEXT:    s_waitcnt vmcnt(0)
 ; NOSDWA-NEXT:    s_mov_b64 vcc, vcc
@@ -2032,13 +2033,14 @@ define amdgpu_kernel void @sdwa_crash_inlineasm_def() #0 {
 ; GFX89-LABEL: sdwa_crash_inlineasm_def:
 ; GFX89:       ; %bb.0: ; %bb
 ; GFX89-NEXT:    s_mov_b32 s0, 0xffff
+; GFX89-NEXT:    s_and_b64 vcc, exec, -1
+; GFX89-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX89-NEXT:  .LBB21_1: ; %bb1
+; GFX89-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX89-NEXT:    ;;#ASMSTART
 ; GFX89-NEXT:    v_and_b32_e32 v0, s0, v0
 ; GFX89-NEXT:    ;;#ASMEND
 ; GFX89-NEXT:    v_or_b32_e32 v0, 0x10000, v0
-; GFX89-NEXT:    s_and_b64 vcc, exec, -1
-; GFX89-NEXT:  .LBB21_1: ; %bb1
-; GFX89-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX89-NEXT:    flat_store_dwordx2 v[0:1], v[0:1]
 ; GFX89-NEXT:    s_waitcnt vmcnt(0)
 ; GFX89-NEXT:    s_mov_b64 vcc, vcc
@@ -2049,13 +2051,14 @@ define amdgpu_kernel void @sdwa_crash_inlineasm_def() #0 {
 ; GFX9-LABEL: sdwa_crash_inlineasm_def:
 ; GFX9:       ; %bb.0: ; %bb
 ; GFX9-NEXT:    s_mov_b32 s0, 0xffff
+; GFX9-NEXT:    s_and_b64 vcc, exec, -1
+; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX9-NEXT:  .LBB21_1: ; %bb1
+; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT:    ;;#ASMSTART
 ; GFX9-NEXT:    v_and_b32_e32 v0, s0, v0
 ; GFX9-NEXT:    ;;#ASMEND
 ; GFX9-NEXT:    v_or_b32_e32 v0, 0x10000, v0
-; GFX9-NEXT:    s_and_b64 vcc, exec, -1
-; GFX9-NEXT:  .LBB21_1: ; %bb1
-; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT:    global_store_dwordx2 v[0:1], v[0:1], off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_mov_b64 vcc, vcc
@@ -2066,13 +2069,14 @@ define amdgpu_kernel void @sdwa_crash_inlineasm_def() #0 {
 ; GFX10-LABEL: sdwa_crash_inlineasm_def:
 ; GFX10:       ; %bb.0: ; %bb
 ; GFX10-NEXT:    s_mov_b32 s0, 0xffff
+; GFX10-NEXT:    s_mov_b32 vcc_lo, exec_lo
+; GFX10-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX10-NEXT:  .LBB21_1: ; %bb1
+; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    v_and_b32_e32 v0, s0, v0
 ; GFX10-NEXT:    ;;#ASMEND
 ; GFX10-NEXT:    v_or_b32_e32 v0, 0x10000, v0
-; GFX10-NEXT:    s_mov_b32 vcc_lo, exec_lo
-; GFX10-NEXT:  .LBB21_1: ; %bb1
-; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    global_store_dwordx2 v[0:1], v[0:1], off
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    s_cbranch_vccnz .LBB21_1

diff --git a/llvm/test/CodeGen/ARM/arm-shrink-wrapping.ll b/llvm/test/CodeGen/ARM/arm-shrink-wrapping.ll
@@ -1601,11 +1601,11 @@ define void @infiniteloop2() "frame-pointer"="all" {
 ; ARM-ENABLE-NEXT:  @ %bb.1: @ %if.then
 ; ARM-ENABLE-NEXT:    sub r1, sp, #16
 ; ARM-ENABLE-NEXT:    mov sp, r1
+; ARM-ENABLE-NEXT:  LBB10_2: @ %for.body
+; ARM-ENABLE-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; ARM-ENABLE-NEXT:    @ InlineAsm Start
 ; ARM-ENABLE-NEXT:    mov r2, #0
 ; ARM-ENABLE-NEXT:    @ InlineAsm End
-; ARM-ENABLE-NEXT:  LBB10_2: @ %for.body
-; ARM-ENABLE-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; ARM-ENABLE-NEXT:    add r0, r2, r0
 ; ARM-ENABLE-NEXT:    str r0, [r1]
 ; ARM-ENABLE-NEXT:    @ InlineAsm Start
@@ -1629,11 +1629,11 @@ define void @infiniteloop2() "frame-pointer"="all" {
 ; ARM-DISABLE-NEXT:  @ %bb.1: @ %if.then
 ; ARM-DISABLE-NEXT:    sub r1, sp, #16
 ; ARM-DISABLE-NEXT:    mov sp, r1
+; ARM-DISABLE-NEXT:  LBB10_2: @ %for.body
+; ARM-DISABLE-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; ARM-DISABLE-NEXT:    @ InlineAsm Start
 ; ARM-DISABLE-NEXT:    mov r2, #0
 ; ARM-DISABLE-NEXT:    @ InlineAsm End
-; ARM-DISABLE-NEXT:  LBB10_2: @ %for.body
-; ARM-DISABLE-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; ARM-DISABLE-NEXT:    add r0, r2, r0
 ; ARM-DISABLE-NEXT:    str r0, [r1]
 ; ARM-DISABLE-NEXT:    @ InlineAsm Start
@@ -1657,11 +1657,11 @@ define void @infiniteloop2() "frame-pointer"="all" {
 ; THUMB-ENABLE-NEXT:    sub.w r0, sp, #16
 ; THUMB-ENABLE-NEXT:    mov sp, r0
 ; THUMB-ENABLE-NEXT:    movs r1, #0
+; THUMB-ENABLE-NEXT:  LBB10_2: @ %for.body
+; THUMB-ENABLE-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; THUMB-ENABLE-NEXT:    @ InlineAsm Start
 ; THUMB-ENABLE-NEXT:    mov.w r2, #0
 ; THUMB-ENABLE-NEXT:    @ InlineAsm End
-; THUMB-ENABLE-NEXT:  LBB10_2: @ %for.body
-; THUMB-ENABLE-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; THUMB-ENABLE-NEXT:    add r1, r2
 ; THUMB-ENABLE-NEXT:    str r1, [r0]
 ; THUMB-ENABLE-NEXT:    @ InlineAsm Start
@@ -1686,11 +1686,11 @@ define void @infiniteloop2() "frame-pointer"="all" {
 ; THUMB-DISABLE-NEXT:    sub.w r0, sp, #16
 ; THUMB-DISABLE-NEXT:    mov sp, r0
 ; THUMB-DISABLE-NEXT:    movs r1, #0
+; THUMB-DISABLE-NEXT:  LBB10_2: @ %for.body
+; THUMB-DISABLE-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; THUMB-DISABLE-NEXT:    @ InlineAsm Start
 ; THUMB-DISABLE-NEXT:    mov.w r2, #0
 ; THUMB-DISABLE-NEXT:    @ InlineAsm End
-; THUMB-DISABLE-NEXT:  LBB10_2: @ %for.body
-; THUMB-DISABLE-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; THUMB-DISABLE-NEXT:    add r1, r2
 ; THUMB-DISABLE-NEXT:    str r1, [r0]
 ; THUMB-DISABLE-NEXT:    @ InlineAsm Start

diff --git a/llvm/test/CodeGen/X86/x86-shrink-wrapping.ll b/llvm/test/CodeGen/X86/x86-shrink-wrapping.ll
@@ -821,12 +821,12 @@ define void @infiniteloop() {
 ; ENABLE-NEXT:    movq %rsp, %rcx
 ; ENABLE-NEXT:    addq $-16, %rcx
 ; ENABLE-NEXT:    movq %rcx, %rsp
-; ENABLE-NEXT:    ## InlineAsm Start
-; ENABLE-NEXT:    movl $1, %edx
-; ENABLE-NEXT:    ## InlineAsm End
 ; ENABLE-NEXT:    .p2align 4
 ; ENABLE-NEXT:  LBB10_2: ## %for.body
 ; ENABLE-NEXT:    ## =>This Inner Loop Header: Depth=1
+; ENABLE-NEXT:    ## InlineAsm Start
+; ENABLE-NEXT:    movl $1, %edx
+; ENABLE-NEXT:    ## InlineAsm End
 ; ENABLE-NEXT:    addl %edx, %eax
 ; ENABLE-NEXT:    movl %eax, (%rcx)
 ; ENABLE-NEXT:    jmp LBB10_2
@@ -853,12 +853,12 @@ define void @infiniteloop() {
 ; DISABLE-NEXT:    movq %rsp, %rcx
 ; DISABLE-NEXT:    addq $-16, %rcx
 ; DISABLE-NEXT:    movq %rcx, %rsp
-; DISABLE-NEXT:    ## InlineAsm Start
-; DISABLE-NEXT:    movl $1, %edx
-; DISABLE-NEXT:    ## InlineAsm End
 ; DISABLE-NEXT:    .p2align 4
 ; DISABLE-NEXT:  LBB10_2: ## %for.body
 ; DISABLE-NEXT:    ## =>This Inner Loop Header: Depth=1
+; DISABLE-NEXT:    ## InlineAsm Start
+; DISABLE-NEXT:    movl $1, %edx
+; DISABLE-NEXT:    ## InlineAsm End
 ; DISABLE-NEXT:    addl %edx, %eax
 ; DISABLE-NEXT:    movl %eax, (%rcx)
 ; DISABLE-NEXT:    jmp LBB10_2

diff --git a/llvm/test/CodeGen/X86/x86-win64-shrink-wrapping.ll b/llvm/test/CodeGen/X86/x86-win64-shrink-wrapping.ll
@@ -24,12 +24,12 @@ define i32 @loopInfoSaveOutsideLoop(i32 %cond, i32 %N) #0 {
 ; ENABLE-NEXT:    #NO_APP
 ; ENABLE-NEXT:    xorl %eax, %eax
 ; ENABLE-NEXT:    movl $10, %ecx
-; ENABLE-NEXT:    #APP
-; ENABLE-NEXT:    movl $1, %edx
-; ENABLE-NEXT:    #NO_APP
 ; ENABLE-NEXT:    .p2align 4
 ; ENABLE-NEXT:  .LBB0_2: # %for.body
 ; ENABLE-NEXT:    # =>This Inner Loop Header: Depth=1
+; ENABLE-NEXT:    #APP
+; ENABLE-NEXT:    movl $1, %edx
+; ENABLE-NEXT:    #NO_APP
 ; ENABLE-NEXT:    addl %edx, %eax
 ; ENABLE-NEXT:    decl %ecx
 ; ENABLE-NEXT:    jne .LBB0_2
@@ -64,12 +64,12 @@ define i32 @loopInfoSaveOutsideLoop(i32 %cond, i32 %N) #0 {
 ; DISABLE-NEXT:    #NO_APP
 ; DISABLE-NEXT:    xorl %eax, %eax
 ; DISABLE-NEXT:    movl $10, %ecx
-; DISABLE-NEXT:    #APP
-; DISABLE-NEXT:    movl $1, %edx
-; DISABLE-NEXT:    #NO_APP
 ; DISABLE-NEXT:    .p2align 4
 ; DISABLE-NEXT:  .LBB0_2: # %for.body
 ; DISABLE-NEXT:    # =>This Inner Loop Header: Depth=1
+; DISABLE-NEXT:    #APP
+; DISABLE-NEXT:    movl $1, %edx
+; DISABLE-NEXT:    #NO_APP
 ; DISABLE-NEXT:    addl %edx, %eax
 ; DISABLE-NEXT:    decl %ecx
 ; DISABLE-NEXT:    jne .LBB0_2