
Commit 0ac0d2a

[BACKEND] Remove special handling for bf16 in fp->int, int->fp handling (#4281)
This PR removes some special handling for int->bf16 and bf16->int conversions in the TritonNVIDIAGPU->LLVM lowerings, in order to support, e.g., the `cvt.bf16.s32` and `cvt.s32.bf16` instructions that are now available on Hopper.

Before this PR, there was special handling for conversions to and from bf16: an int->bf16 conversion was lowered as an int->fp32 followed by an fp32->bf16. Presumably this was done because, before sm90, the PTX `cvt` instruction does not support conversions to/from bf16. However, sm90 _does_ support direct conversions to/from bf16, so this PR removes the special handling in order to make use of the direct `cvt` instructions. For Ampere, it looks like the special handling is no longer needed and LLVM handles the details of the different hardware implementations (perhaps thanks to llvm/llvm-project#74827?).

The core Triton is a small number of people, and we receive many PRs (thank you!). To help us review your code more quickly, **if you are a new contributor (less than 3 PRs merged) we ask that you complete the following tasks and include the filled-out checklist in your PR description.** Complete the following tasks before sending your PR, and replace `[ ]` with `[x]` to indicate you have done them.

- [x] I am not making a trivial change, such as fixing a typo in a comment.
- [x] I have written a PR description following these [rules](https://cbea.ms/git-commit/#why-not-how).
- [x] I have run `pre-commit run --from-ref origin/main --to-ref HEAD`.
- Select one of the following.
  - [x] I have added tests.
    - `/test` for `lit` tests
    - `/unittest` for C++ tests
    - `/python/test` for end-to-end tests
  - [ ] This PR does not need a test because `FILL THIS IN`.
- Select one of the following.
  - [ ] I have not added any `lit` tests.
  - [x] The `lit` tests I have added follow these [best practices](https://mlir.llvm.org/getting_started/TestingGuide/#filecheck-best-practices), including the "tests should be minimal" section. (Usually running Python code and using the instructions it generates is not minimal.)
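For context, here is a minimal sketch (not part of this PR) of the kind of user-level Triton kernel whose int32 -> bf16 cast goes through the `arith.sitofp` lowering touched here. The kernel and tensor names are illustrative only, and it assumes a CUDA device with standard `triton` and `torch` installs.

```python
import torch
import triton
import triton.language as tl


@triton.jit
def int_to_bf16_kernel(x_ptr, y_ptr, n_elements, BLOCK_SIZE: tl.constexpr):
    # Each program instance handles one contiguous block of elements.
    offsets = tl.program_id(0) * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    mask = offsets < n_elements
    x = tl.load(x_ptr + offsets, mask=mask)  # int32 values
    # This cast becomes arith.sitofp; per this PR it can now lower to a
    # single llvm.sitofp from i32 to bf16 (a direct cvt on sm90).
    y = x.to(tl.bfloat16)
    tl.store(y_ptr + offsets, y, mask=mask)


x = torch.randint(-1000, 1000, (1024,), dtype=torch.int32, device="cuda")
y = torch.empty(x.shape, dtype=torch.bfloat16, device="cuda")
grid = (triton.cdiv(x.numel(), 256),)
int_to_bf16_kernel[grid](x, y, x.numel(), BLOCK_SIZE=256)
```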
1 parent b674269 commit 0ac0d2a

File tree

2 files changed: +25 −11 lines changed


test/Conversion/tritongpu_to_llvm.mlir

Lines changed: 24 additions & 0 deletions
@@ -1643,3 +1643,27 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 :
     tt.return
   }
 }
+
+// -----
+
+#blocked = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
+module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32} {
+  tt.func @int32_to_bf16(%arg0: tensor<256xi32, #blocked>) attributes {noinline = false} {
+    // CHECK-LABEL: @int32_to_bf16
+    // CHECK: llvm.sitofp %{{.*}} : i32 to bf16
+    %a = arith.sitofp %arg0 : tensor<256xi32, #blocked> to tensor<256xbf16, #blocked>
+    tt.return
+  }
+}
+
+// -----
+
+#blocked = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}>
+module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32} {
+  tt.func @bf16_to_int32(%arg0: tensor<256xbf16, #blocked>) attributes {noinline = false} {
+    // CHECK-LABEL: @bf16_to_int32
+    // CHECK: llvm.fptosi %{{.*}} : bf16 to i32
+    %a = arith.fptosi %arg0 : tensor<256xbf16, #blocked> to tensor<256xi32, #blocked>
+    tt.return
+  }
+}

third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/ElementwiseOpToLLVM.cpp

Lines changed: 1 addition & 11 deletions
@@ -664,10 +664,6 @@ struct SIToFPOpConversion
       auto outVals = cvtFunc(loc, rewriter, inVals);
       assert(outVals.size() == 4);
       return outVals;
-    } else if (outElemTy.isBF16()) {
-      auto value = rewriter.create<LLVM::SIToFPOp>(loc, f32_ty, operands[0][0]);
-      return {FpToFpOpConversion::convertFp32ToBf16(loc, rewriter, value,
-                                                    RoundingMode::RTNE)};
     } else {
       return {rewriter.create<LLVM::SIToFPOp>(loc, elemTy, operands[0][0])};
     }
@@ -685,13 +681,7 @@ struct FPToSIOpConversion
                                    Type elemTy, MultipleOperandsRange operands,
                                    Location loc) const {
     auto inElemTy = getElementType(op.getIn());
-    if (inElemTy.isBF16()) {
-      auto value =
-          FpToFpOpConversion::convertBf16ToFp32(loc, rewriter, operands[0][0]);
-      return {rewriter.create<LLVM::FPToSIOp>(loc, elemTy, value)};
-    } else {
-      return {rewriter.create<LLVM::FPToSIOp>(loc, elemTy, operands[0][0])};
-    }
+    return {rewriter.create<LLVM::FPToSIOp>(loc, elemTy, operands[0][0])};
   }
 };
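For intuition only, here is a rough host-side Python analogy (using torch casts, not the actual lowering or PTX) of the two strategies described in the commit message: the removed two-step int32 -> fp32 -> bf16 path versus a single direct int32 -> bf16 conversion.

```python
import torch

x = torch.tensor([0, 1, -7, 300, 123456], dtype=torch.int32)

# Old lowering, roughly: convert to fp32 first, then narrow fp32 to bf16.
two_step = x.to(torch.float32).to(torch.bfloat16)

# New lowering, roughly: one conversion straight from int32 to bf16.
direct = x.to(torch.bfloat16)

print(two_step)
print(direct)
```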

0 commit comments
