Skip to content

[NVPTX] cleanup & canonicalize mov #129344

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits into from
Mar 23, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -994,7 +994,7 @@ void NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
Opc = TM.is64Bit() ? NVPTX::cvta_to_local_64 : NVPTX::cvta_to_local;
break;
case ADDRESS_SPACE_PARAM:
Opc = TM.is64Bit() ? NVPTX::IMOV64rr : NVPTX::IMOV32rr;
Opc = TM.is64Bit() ? NVPTX::IMOV64r : NVPTX::IMOV32r;
break;
}

Expand Down Expand Up @@ -2151,10 +2151,10 @@ bool NVPTXDAGToDAGISel::tryBF16ArithToFMA(SDNode *N) {
auto API = APF.bitcastToAPInt();
API = API.concat(API);
auto Const = CurDAG->getTargetConstant(API, DL, MVT::i32);
return SDValue(CurDAG->getMachineNode(NVPTX::IMOV32ri, DL, VT, Const), 0);
return SDValue(CurDAG->getMachineNode(NVPTX::IMOV32i, DL, VT, Const), 0);
}
auto Const = CurDAG->getTargetConstantFP(APF, DL, VT);
return SDValue(CurDAG->getMachineNode(NVPTX::BFMOV16ri, DL, VT, Const), 0);
return SDValue(CurDAG->getMachineNode(NVPTX::BFMOV16i, DL, VT, Const), 0);
};

switch (N->getOpcode()) {
Expand Down
14 changes: 7 additions & 7 deletions llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,22 +40,22 @@ void NVPTXInstrInfo::copyPhysReg(MachineBasicBlock &MBB,

unsigned Op;
if (DestRC == &NVPTX::Int1RegsRegClass) {
Op = NVPTX::IMOV1rr;
Op = NVPTX::IMOV1r;
} else if (DestRC == &NVPTX::Int16RegsRegClass) {
Op = NVPTX::IMOV16rr;
Op = NVPTX::MOV16r;
} else if (DestRC == &NVPTX::Int32RegsRegClass) {
Op = (SrcRC == &NVPTX::Int32RegsRegClass ? NVPTX::IMOV32rr
Op = (SrcRC == &NVPTX::Int32RegsRegClass ? NVPTX::IMOV32r
: NVPTX::BITCONVERT_32_F2I);
} else if (DestRC == &NVPTX::Int64RegsRegClass) {
Op = (SrcRC == &NVPTX::Int64RegsRegClass ? NVPTX::IMOV64rr
Op = (SrcRC == &NVPTX::Int64RegsRegClass ? NVPTX::IMOV64r
: NVPTX::BITCONVERT_64_F2I);
} else if (DestRC == &NVPTX::Int128RegsRegClass) {
Op = NVPTX::IMOV128rr;
Op = NVPTX::IMOV128r;
} else if (DestRC == &NVPTX::Float32RegsRegClass) {
Op = (SrcRC == &NVPTX::Float32RegsRegClass ? NVPTX::FMOV32rr
Op = (SrcRC == &NVPTX::Float32RegsRegClass ? NVPTX::FMOV32r
: NVPTX::BITCONVERT_32_I2F);
} else if (DestRC == &NVPTX::Float64RegsRegClass) {
Op = (SrcRC == &NVPTX::Float64RegsRegClass ? NVPTX::FMOV64rr
Op = (SrcRC == &NVPTX::Float64RegsRegClass ? NVPTX::FMOV64r
: NVPTX::BITCONVERT_64_I2F);
} else {
llvm_unreachable("Bad register copy");
Expand Down
89 changes: 37 additions & 52 deletions llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -1945,68 +1945,53 @@ def Wrapper : SDNode<"NVPTXISD::Wrapper", SDTWrapper>;

// Load a memory address into a 32-bit or 64-bit register.
def MOV_ADDR : NVPTXInst<(outs Int32Regs:$dst), (ins ADDR_base:$a),
"mov.u32 \t$dst, $a;",
"mov.b32 \t$dst, $a;",
[(set i32:$dst, (Wrapper tglobaladdr:$a))]>;
def MOV_ADDR64 : NVPTXInst<(outs Int64Regs:$dst), (ins ADDR_base:$a),
"mov.u64 \t$dst, $a;",
"mov.b64 \t$dst, $a;",
[(set i64:$dst, (Wrapper tglobaladdr:$a))]>;

// Get pointer to local stack.
let hasSideEffects = false in {
def MOV_DEPOT_ADDR : NVPTXInst<(outs Int32Regs:$d), (ins i32imm:$num),
"mov.u32 \t$d, __local_depot$num;", []>;
"mov.b32 \t$d, __local_depot$num;", []>;
def MOV_DEPOT_ADDR_64 : NVPTXInst<(outs Int64Regs:$d), (ins i32imm:$num),
"mov.u64 \t$d, __local_depot$num;", []>;
"mov.b64 \t$d, __local_depot$num;", []>;
}


// NVPTXInstrInfo::copyPhysReg (in NVPTXInstrInfo.cpp) hard-codes the opcode
// names of the register-to-register moves below.
let hasSideEffects=0, isAsCheapAsAMove=1 in {
def IMOV1rr : NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$sss),
"mov.pred \t$dst, $sss;", []>;
def IMOV16rr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$sss),
"mov.u16 \t$dst, $sss;", []>;
def IMOV32rr : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$sss),
"mov.u32 \t$dst, $sss;", []>;
def IMOV64rr : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$sss),
"mov.u64 \t$dst, $sss;", []>;
def IMOV128rr : NVPTXInst<(outs Int128Regs:$dst), (ins Int128Regs:$sss),
"mov.b128 \t$dst, $sss;", []>;

def FMOV32rr : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
"mov.f32 \t$dst, $src;", []>;
def FMOV64rr : NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$src),
"mov.f64 \t$dst, $src;", []>;

def IMOV1ri : NVPTXInst<(outs Int1Regs:$dst), (ins i1imm:$src),
"mov.pred \t$dst, $src;",
[(set i1:$dst, imm:$src)]>;
def IMOV16ri : NVPTXInst<(outs Int16Regs:$dst), (ins i16imm:$src),
"mov.b16 \t$dst, $src;",
[(set i16:$dst, imm:$src)]>;
def IMOV32ri : NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$src),
"mov.b32 \t$dst, $src;",
[(set i32:$dst, imm:$src)]>;
def IMOV64ri : NVPTXInst<(outs Int64Regs:$dst), (ins i64imm:$src),
"mov.b64 \t$dst, $src;",
[(set i64:$dst, imm:$src)]>;

def FMOV16ri : NVPTXInst<(outs Int16Regs:$dst), (ins f16imm:$src),
"mov.b16 \t$dst, $src;",
[(set f16:$dst, fpimm:$src)]>;
def BFMOV16ri : NVPTXInst<(outs Int16Regs:$dst), (ins bf16imm:$src),
"mov.b16 \t$dst, $src;",
[(set bf16:$dst, fpimm:$src)]>;
def FMOV32ri : NVPTXInst<(outs Float32Regs:$dst), (ins f32imm:$src),
"mov.f32 \t$dst, $src;",
[(set f32:$dst, fpimm:$src)]>;
def FMOV64ri : NVPTXInst<(outs Float64Regs:$dst), (ins f64imm:$src),
"mov.f64 \t$dst, $src;",
[(set f64:$dst, fpimm:$src)]>;
}

def : Pat<(i32 (Wrapper texternalsym:$dst)), (IMOV32ri texternalsym:$dst)>;
def : Pat<(i64 (Wrapper texternalsym:$dst)), (IMOV64ri texternalsym:$dst)>;
// Shared classes for PTX `mov` instructions. Everything declared inside this
// `let` gets hasSideEffects = false and isAsCheapAsAMove = true.
let hasSideEffects = false, isAsCheapAsAMove = true in {
// Class for register-to-register moves
//   RC    - register class used for both $dst and $src.
//   OpStr - suffix appended after "mov." in the emitted assembly string.
// The pattern list is empty: this class attaches no selection pattern.
class MOVr<RegisterClass RC, string OpStr> :
NVPTXInst<(outs RC:$dst), (ins RC:$src),
"mov." # OpStr # " \t$dst, $src;", []>;

// Class for immediate-to-register moves
//   RC      - register class of the destination $dst.
//   OpStr   - suffix appended after "mov." in the emitted assembly string.
//   VT      - value type given to $dst in the selection pattern.
//   IMMType - operand type of the immediate source $src.
//   ImmNode - node matched as the immediate in the selection pattern
//             (the instantiations in this file pass `imm` or `fpimm`).
class MOVi<RegisterClass RC, string OpStr, ValueType VT, Operand IMMType, SDNode ImmNode> :
NVPTXInst<(outs RC:$dst), (ins IMMType:$src),
"mov." # OpStr # " \t$dst, $src;",
[(set VT:$dst, ImmNode:$src)]>;
}

// Concrete move instructions. A trailing `r` marks a register source (MOVr)
// and a trailing `i` an immediate source (MOVi). Predicates use the "pred"
// suffix; every other register class uses an untyped "bN" suffix, including
// the Float32Regs/Float64Regs moves.
def IMOV1r : MOVr<Int1Regs, "pred">;
def IMOV1i : MOVi<Int1Regs, "pred", i1, i1imm, imm>;
// Int16Regs is also the destination class of the f16/bf16 immediate moves
// below, which is presumably why this one has no `I` prefix -- TODO confirm.
def MOV16r : MOVr<Int16Regs, "b16">;
def IMOV16i : MOVi<Int16Regs, "b16", i16, i16imm, imm>;
def IMOV32r : MOVr<Int32Regs, "b32">;
def IMOV32i : MOVi<Int32Regs, "b32", i32, i32imm, imm>;
def IMOV64r : MOVr<Int64Regs, "b64">;
def IMOV64i : MOVi<Int64Regs, "b64", i64, i64imm, imm>;
// No immediate form is defined here for 128-bit registers.
def IMOV128r : MOVr<Int128Regs, "b128">;
// f16 and bf16 immediates are written into Int16Regs.
def FMOV16i : MOVi<Int16Regs, "b16", f16, f16imm, fpimm>;
def BFMOV16i : MOVi<Int16Regs, "b16", bf16, bf16imm, fpimm>;
def FMOV32r : MOVr<Float32Regs, "b32">;
def FMOV32i : MOVi<Float32Regs, "b32", f32, f32imm, fpimm>;
def FMOV64r : MOVr<Float64Regs, "b64">;
def FMOV64i : MOVi<Float64Regs, "b64", f64, f64imm, fpimm>;

// Select a Wrapper around a target external symbol to the integer immediate
// move of matching width, passing the symbol as the move's source operand.
def : Pat<(i32 (Wrapper texternalsym:$dst)), (IMOV32i texternalsym:$dst)>;
def : Pat<(i64 (Wrapper texternalsym:$dst)), (IMOV64i texternalsym:$dst)>;

//---- Copy Frame Index ----
def LEA_ADDRi : NVPTXInst<(outs Int32Regs:$dst), (ins ADDR:$addr),
Expand Down Expand Up @@ -2717,8 +2702,8 @@ def ProxyRegI1 : ProxyRegInst<"pred", i1, Int1Regs>;
def ProxyRegI16 : ProxyRegInst<"b16", i16, Int16Regs>;
def ProxyRegI32 : ProxyRegInst<"b32", i32, Int32Regs>;
def ProxyRegI64 : ProxyRegInst<"b64", i64, Int64Regs>;
def ProxyRegF32 : ProxyRegInst<"f32", f32, Float32Regs>;
def ProxyRegF64 : ProxyRegInst<"f64", f64, Float64Regs>;
def ProxyRegF32 : ProxyRegInst<"b32", f32, Float32Regs>;
def ProxyRegF64 : ProxyRegInst<"b64", f64, Float64Regs>;

foreach vt = [f16, bf16] in {
def: Pat<(vt (ProxyReg vt:$src)), (ProxyRegI16 $src)>;
Expand Down
8 changes: 4 additions & 4 deletions llvm/test/CodeGen/NVPTX/atomics-sm70.ll
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, half %
; CHECKPTX62-NEXT: or.b32 %r32, %r31, %r30;
; CHECKPTX62-NEXT: atom.cas.b32 %r6, [%r1], %r54, %r32;
; CHECKPTX62-NEXT: setp.ne.s32 %p1, %r6, %r54;
; CHECKPTX62-NEXT: mov.u32 %r54, %r6;
; CHECKPTX62-NEXT: mov.b32 %r54, %r6;
; CHECKPTX62-NEXT: @%p1 bra $L__BB0_1;
; CHECKPTX62-NEXT: // %bb.2: // %atomicrmw.end44
; CHECKPTX62-NEXT: ld.u32 %r55, [%r1];
Expand All @@ -88,7 +88,7 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, half %
; CHECKPTX62-NEXT: or.b32 %r37, %r36, %r35;
; CHECKPTX62-NEXT: atom.cas.b32 %r9, [%r1], %r55, %r37;
; CHECKPTX62-NEXT: setp.ne.s32 %p2, %r9, %r55;
; CHECKPTX62-NEXT: mov.u32 %r55, %r9;
; CHECKPTX62-NEXT: mov.b32 %r55, %r9;
; CHECKPTX62-NEXT: @%p2 bra $L__BB0_3;
; CHECKPTX62-NEXT: // %bb.4: // %atomicrmw.end26
; CHECKPTX62-NEXT: and.b32 %r10, %r22, -4;
Expand All @@ -109,7 +109,7 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, half %
; CHECKPTX62-NEXT: or.b32 %r45, %r44, %r43;
; CHECKPTX62-NEXT: atom.global.cas.b32 %r15, [%r10], %r56, %r45;
; CHECKPTX62-NEXT: setp.ne.s32 %p3, %r15, %r56;
; CHECKPTX62-NEXT: mov.u32 %r56, %r15;
; CHECKPTX62-NEXT: mov.b32 %r56, %r15;
; CHECKPTX62-NEXT: @%p3 bra $L__BB0_5;
; CHECKPTX62-NEXT: // %bb.6: // %atomicrmw.end8
; CHECKPTX62-NEXT: and.b32 %r16, %r23, -4;
Expand All @@ -130,7 +130,7 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, half %
; CHECKPTX62-NEXT: or.b32 %r53, %r52, %r51;
; CHECKPTX62-NEXT: atom.shared.cas.b32 %r21, [%r16], %r57, %r53;
; CHECKPTX62-NEXT: setp.ne.s32 %p4, %r21, %r57;
; CHECKPTX62-NEXT: mov.u32 %r57, %r21;
; CHECKPTX62-NEXT: mov.b32 %r57, %r21;
; CHECKPTX62-NEXT: @%p4 bra $L__BB0_7;
; CHECKPTX62-NEXT: // %bb.8: // %atomicrmw.end
; CHECKPTX62-NEXT: ret;
Expand Down
8 changes: 4 additions & 4 deletions llvm/test/CodeGen/NVPTX/atomics-sm90.ll
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat
; CHECKPTX71-NEXT: or.b32 %r32, %r31, %r30;
; CHECKPTX71-NEXT: atom.relaxed.cas.b32 %r6, [%r1], %r54, %r32;
; CHECKPTX71-NEXT: setp.ne.s32 %p1, %r6, %r54;
; CHECKPTX71-NEXT: mov.u32 %r54, %r6;
; CHECKPTX71-NEXT: mov.b32 %r54, %r6;
; CHECKPTX71-NEXT: @%p1 bra $L__BB0_1;
; CHECKPTX71-NEXT: // %bb.2: // %atomicrmw.end44
; CHECKPTX71-NEXT: ld.u32 %r55, [%r1];
Expand All @@ -89,7 +89,7 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat
; CHECKPTX71-NEXT: or.b32 %r37, %r36, %r35;
; CHECKPTX71-NEXT: atom.relaxed.cas.b32 %r9, [%r1], %r55, %r37;
; CHECKPTX71-NEXT: setp.ne.s32 %p2, %r9, %r55;
; CHECKPTX71-NEXT: mov.u32 %r55, %r9;
; CHECKPTX71-NEXT: mov.b32 %r55, %r9;
; CHECKPTX71-NEXT: @%p2 bra $L__BB0_3;
; CHECKPTX71-NEXT: // %bb.4: // %atomicrmw.end26
; CHECKPTX71-NEXT: and.b32 %r10, %r22, -4;
Expand All @@ -111,7 +111,7 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat
; CHECKPTX71-NEXT: or.b32 %r45, %r44, %r43;
; CHECKPTX71-NEXT: atom.relaxed.global.cas.b32 %r15, [%r10], %r56, %r45;
; CHECKPTX71-NEXT: setp.ne.s32 %p3, %r15, %r56;
; CHECKPTX71-NEXT: mov.u32 %r56, %r15;
; CHECKPTX71-NEXT: mov.b32 %r56, %r15;
; CHECKPTX71-NEXT: @%p3 bra $L__BB0_5;
; CHECKPTX71-NEXT: // %bb.6: // %atomicrmw.end8
; CHECKPTX71-NEXT: and.b32 %r16, %r23, -4;
Expand All @@ -133,7 +133,7 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat
; CHECKPTX71-NEXT: or.b32 %r53, %r52, %r51;
; CHECKPTX71-NEXT: atom.relaxed.shared.cas.b32 %r21, [%r16], %r57, %r53;
; CHECKPTX71-NEXT: setp.ne.s32 %p4, %r21, %r57;
; CHECKPTX71-NEXT: mov.u32 %r57, %r21;
; CHECKPTX71-NEXT: mov.b32 %r57, %r21;
; CHECKPTX71-NEXT: @%p4 bra $L__BB0_7;
; CHECKPTX71-NEXT: // %bb.8: // %atomicrmw.end
; CHECKPTX71-NEXT: ret;
Expand Down
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/NVPTX/atomics.ll
Original file line number Diff line number Diff line change
Expand Up @@ -429,7 +429,7 @@ define half @atomicrmw_add_f16_generic(ptr %addr, half %val) {
; CHECK-NEXT: membar.sys;
; CHECK-NEXT: atom.cas.b32 %r5, [%rd1], %r16, %r14;
; CHECK-NEXT: setp.ne.s32 %p1, %r5, %r16;
; CHECK-NEXT: mov.u32 %r16, %r5;
; CHECK-NEXT: mov.b32 %r16, %r5;
; CHECK-NEXT: @%p1 bra $L__BB22_1;
; CHECK-NEXT: // %bb.2: // %atomicrmw.end
; CHECK-NEXT: shr.u32 %r15, %r5, %r1;
Expand Down
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/NVPTX/call-with-alloca-buffer.ll
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ entry:
%buf = alloca [16 x i8], align 4

; CHECK: .local .align 4 .b8 __local_depot0[16]
; CHECK: mov.u64 %SPL
; CHECK: mov.b64 %SPL

; CHECK: ld.param.u64 %rd[[A_REG:[0-9]+]], [kernel_func_param_0]
; CHECK: cvta.to.global.u64 %rd[[A1_REG:[0-9]+]], %rd[[A_REG]]
Expand Down
Loading