Skip to content

Commit 6109e70

Browse files
committed
[llvm][APFloat] Add NaN-in-negative-zero formats by AMD and GraphCore
AMD, GraphCore, and Qualcomm have published a standard for 8-bit floats that differs from the 8-bit floats defined by Nvidia, Intel, and ARM. This commit adds support for these alternate 8-bit floats to APFloat in order to enable their usage in MLIR. These formats are presented in the paper at https://arxiv.org/abs/2206.02915 and are implemented in GraphCore hardware whose ISA is available at https://docs.graphcore.ai/projects/isa-mk2-with-fp8/en/latest/_static/TileVertexISA-IPU21-1.3.1.pdf . In these formats, like the existing Float8E4M3FN, there are no infinity values and there is only one NaN. Unlike in that format, however, the NaN value is 0x80, which would be negative 0 in IEEE formats. This means that these formats also make 0 unsigned. To allow for these new variant semantics, this commit adds fltNanEncoding, which can be IEEE (the default), AllOnes (used by Float8E4M3FN), or NegativeZero (used by the new formats, Float8E5M2FNUZ and Float8E4M3FNUZ). Normalization, arithmetic, and other such routines have been updated to account for the potential variant semantics. The two new formats are Float8E5M2FNUZ (5 bits exponent, 2 bits mantissa, finite, unsigned zero) and Float8E4M3FNUZ (4 bits exponent, 3 bits mantissa, finite, unsigned zero). Reviewed By: jakeh-gc, reedwm, lattner Differential Revision: https://reviews.llvm.org/D141863
1 parent 848c700 commit 6109e70

File tree

3 files changed

+1141
-150
lines changed

3 files changed

+1141
-150
lines changed

llvm/include/llvm/ADT/APFloat.h

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -158,11 +158,26 @@ struct APFloatBase {
158158
// 8-bit floating point number following IEEE-754 conventions with bit
159159
// layout S1E5M2 as described in https://arxiv.org/abs/2209.05433.
160160
S_Float8E5M2,
161+
// 8-bit floating point number mostly following IEEE-754 conventions
162+
// and bit layout S1E5M2 described in https://arxiv.org/abs/2206.02915,
163+
// with expanded range and with no infinity or signed zero.
164+
// NaN is represented as negative zero. (FN -> Finite, UZ -> unsigned zero).
165+
// This format's exponent bias is 16, instead of the 15 (2 ** (5 - 1) - 1)
166+
// that IEEE precedent would imply.
167+
S_Float8E5M2FNUZ,
161168
// 8-bit floating point number mostly following IEEE-754 conventions with
162169
// bit layout S1E4M3 as described in https://arxiv.org/abs/2209.05433.
163170
// Unlike IEEE-754 types, there are no infinity values, and NaN is
164171
// represented with the exponent and mantissa bits set to all 1s.
165172
S_Float8E4M3FN,
173+
// 8-bit floating point number mostly following IEEE-754 conventions
174+
// and bit layout S1E4M3 described in https://arxiv.org/abs/2206.02915,
175+
// with expanded range and with no infinity or signed zero.
176+
// NaN is represented as negative zero. (FN -> Finite, UZ -> unsigned zero).
177+
// This format's exponent bias is 8, instead of the 7 (2 ** (4 - 1) - 1)
178+
// that IEEE precedent would imply.
179+
S_Float8E4M3FNUZ,
180+
166181
S_x87DoubleExtended,
167182
S_MaxSemantics = S_x87DoubleExtended,
168183
};
@@ -177,7 +192,9 @@ struct APFloatBase {
177192
static const fltSemantics &IEEEquad() LLVM_READNONE;
178193
static const fltSemantics &PPCDoubleDouble() LLVM_READNONE;
179194
static const fltSemantics &Float8E5M2() LLVM_READNONE;
195+
static const fltSemantics &Float8E5M2FNUZ() LLVM_READNONE;
180196
static const fltSemantics &Float8E4M3FN() LLVM_READNONE;
197+
static const fltSemantics &Float8E4M3FNUZ() LLVM_READNONE;
181198
static const fltSemantics &x87DoubleExtended() LLVM_READNONE;
182199

183200
/// A Pseudo fltsemantic used to construct APFloats that cannot conflict with
@@ -570,7 +587,9 @@ class IEEEFloat final : public APFloatBase {
570587
APInt convertF80LongDoubleAPFloatToAPInt() const;
571588
APInt convertPPCDoubleDoubleAPFloatToAPInt() const;
572589
APInt convertFloat8E5M2APFloatToAPInt() const;
590+
APInt convertFloat8E5M2FNUZAPFloatToAPInt() const;
573591
APInt convertFloat8E4M3FNAPFloatToAPInt() const;
592+
APInt convertFloat8E4M3FNUZAPFloatToAPInt() const;
574593
void initFromAPInt(const fltSemantics *Sem, const APInt &api);
575594
void initFromHalfAPInt(const APInt &api);
576595
void initFromBFloatAPInt(const APInt &api);
@@ -580,7 +599,9 @@ class IEEEFloat final : public APFloatBase {
580599
void initFromF80LongDoubleAPInt(const APInt &api);
581600
void initFromPPCDoubleDoubleAPInt(const APInt &api);
582601
void initFromFloat8E5M2APInt(const APInt &api);
602+
void initFromFloat8E5M2FNUZAPInt(const APInt &api);
583603
void initFromFloat8E4M3FNAPInt(const APInt &api);
604+
void initFromFloat8E4M3FNUZAPInt(const APInt &api);
584605

585606
void assign(const IEEEFloat &);
586607
void copySignificand(const IEEEFloat &);

0 commit comments

Comments
 (0)