Skip to content

Commit c2c2eb7

Browse files
committed
AMDGPU: Add "D16 instructions preserve unused bits" feature
- Predicate D16 patterns on this new feature
- Added this new feature to gfx900/2/4

Differential Revision: https://reviews.llvm.org/D46366

llvm-svn: 331551
1 parent e1c7a46 commit c2c2eb7

File tree

9 files changed

+388
-323
lines changed

9 files changed

+388
-323
lines changed

llvm/lib/Target/AMDGPU/AMDGPU.td

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -322,6 +322,13 @@ def FeatureDLInsts : SubtargetFeature<"dl-insts",
322322
"Has deep learning instructions"
323323
>;
324324

325+
def FeatureD16PreservesUnusedBits : SubtargetFeature<
326+
"d16-preserves-unused-bits",
327+
"D16PreservesUnusedBits",
328+
"true",
329+
"D16 memory instructions preserve unused bits rather than zeroing them out"
330+
>;
331+
325332
//===------------------------------------------------------------===//
326333
// Subtarget Features (options and debugging)
327334
//===------------------------------------------------------------===//
@@ -608,20 +615,23 @@ def FeatureISAVersion8_1_0 : SubtargetFeatureISAVersion <8,1,0,
608615
def FeatureISAVersion9_0_0 : SubtargetFeatureISAVersion <9,0,0,
609616
[FeatureGFX9,
610617
FeatureMadMixInsts,
611-
FeatureLDSBankCount32
618+
FeatureLDSBankCount32,
619+
FeatureD16PreservesUnusedBits
612620
]>;
613621

614622
def FeatureISAVersion9_0_2 : SubtargetFeatureISAVersion <9,0,2,
615623
[FeatureGFX9,
616624
FeatureMadMixInsts,
617625
FeatureLDSBankCount32,
618-
FeatureXNACK
626+
FeatureXNACK,
627+
FeatureD16PreservesUnusedBits
619628
]>;
620629

621630
def FeatureISAVersion9_0_4 : SubtargetFeatureISAVersion <9,0,4,
622631
[FeatureGFX9,
623632
FeatureLDSBankCount32,
624-
FeatureFmaMixInsts]>;
633+
FeatureFmaMixInsts,
634+
FeatureD16PreservesUnusedBits]>;
625635

626636
def FeatureISAVersion9_0_6 : SubtargetFeatureISAVersion <9,0,6,
627637
[FeatureGFX9,
@@ -769,6 +779,8 @@ def HasUnpackedD16VMem : Predicate<"Subtarget->hasUnpackedD16VMem()">,
769779
def HasPackedD16VMem : Predicate<"!Subtarget->hasUnpackedD16VMem()">,
770780
AssemblerPredicate<"!FeatureUnpackedD16VMem">;
771781

782+
def D16PreservesUnusedBits : Predicate<"Subtarget->d16PreservesUnusedBits()">,
783+
AssemblerPredicate<"FeatureD16PreservesUnusedBits">;
772784

773785
def LDSRequiresM0Init : Predicate<"Subtarget->ldsRequiresM0Init()">;
774786
def NotLDSRequiresM0Init : Predicate<"!Subtarget->ldsRequiresM0Init()">;

llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -162,6 +162,7 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
162162
HasSDWAOutModsVOPC(false),
163163
HasDPP(false),
164164
HasDLInsts(false),
165+
D16PreservesUnusedBits(false),
165166
FlatAddressSpace(false),
166167
FlatInstOffsets(false),
167168
FlatGlobalInsts(false),

llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -166,6 +166,7 @@ class AMDGPUSubtarget : public AMDGPUGenSubtargetInfo {
166166
bool HasSDWAOutModsVOPC;
167167
bool HasDPP;
168168
bool HasDLInsts;
169+
bool D16PreservesUnusedBits;
169170
bool FlatAddressSpace;
170171
bool FlatInstOffsets;
171172
bool FlatGlobalInsts;
@@ -546,6 +547,10 @@ class AMDGPUSubtarget : public AMDGPUGenSubtargetInfo {
546547
return HasDLInsts;
547548
}
548549

550+
bool d16PreservesUnusedBits() const {
551+
return D16PreservesUnusedBits;
552+
}
553+
549554
/// Returns the offset in bytes from the start of the input buffer
550555
/// of the first explicit kernel argument.
551556
unsigned getExplicitKernelArgOffset(const MachineFunction &MF) const {

llvm/lib/Target/AMDGPU/BUFInstructions.td

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1374,7 +1374,7 @@ defm : MUBUFScratchLoadPat <BUFFER_LOAD_DWORD_OFFEN, BUFFER_LOAD_DWORD_OFFSET, i
13741374
defm : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX2_OFFEN, BUFFER_LOAD_DWORDX2_OFFSET, v2i32, load_private>;
13751375
defm : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX4_OFFEN, BUFFER_LOAD_DWORDX4_OFFSET, v4i32, load_private>;
13761376

1377-
let OtherPredicates = [HasD16LoadStore] in {
1377+
let OtherPredicates = [D16PreservesUnusedBits] in {
13781378
defm : MUBUFScratchLoadPat_Hi16<BUFFER_LOAD_SHORT_D16_HI_OFFEN, BUFFER_LOAD_SHORT_D16_HI_OFFSET, i16, load_private>;
13791379
defm : MUBUFScratchLoadPat_Hi16<BUFFER_LOAD_UBYTE_D16_HI_OFFEN, BUFFER_LOAD_UBYTE_D16_HI_OFFSET, i16, az_extloadi8_private>;
13801380
defm : MUBUFScratchLoadPat_Hi16<BUFFER_LOAD_SBYTE_D16_HI_OFFEN, BUFFER_LOAD_SBYTE_D16_HI_OFFSET, i16, sextloadi8_private>;
@@ -1489,7 +1489,7 @@ defm : MUBUFScratchStorePat <BUFFER_STORE_DWORDX2_OFFEN, BUFFER_STORE_DWORDX2_OF
14891489
defm : MUBUFScratchStorePat <BUFFER_STORE_DWORDX4_OFFEN, BUFFER_STORE_DWORDX4_OFFSET, v4i32, store_private>;
14901490

14911491

1492-
let OtherPredicates = [HasD16LoadStore] in {
1492+
let OtherPredicates = [D16PreservesUnusedBits] in {
14931493
// Hiding the extract high pattern in the PatFrag seems to not
14941494
// automatically increase the complexity.
14951495
let AddedComplexity = 1 in {

llvm/lib/Target/AMDGPU/DSInstructions.td

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -655,7 +655,7 @@ defm : DSReadPat_mc <DS_READ_B128, v4i32, "load_align16_local">;
655655

656656
} // End AddedComplexity = 100
657657

658-
let OtherPredicates = [HasD16LoadStore] in {
658+
let OtherPredicates = [D16PreservesUnusedBits] in {
659659
let AddedComplexity = 100 in {
660660
defm : DSReadPat_Hi16<DS_READ_U16_D16_HI, load_local>;
661661
defm : DSReadPat_Hi16<DS_READ_U8_D16_HI, az_extloadi8_local>;
@@ -689,7 +689,7 @@ defm : DSWritePat_mc <DS_WRITE_B8, i16, "truncstorei8_local">;
689689
defm : DSWritePat_mc <DS_WRITE_B16, i16, "store_local">;
690690
defm : DSWritePat_mc <DS_WRITE_B32, i32, "store_local">;
691691

692-
let OtherPredicates = [HasD16LoadStore] in {
692+
let OtherPredicates = [D16PreservesUnusedBits] in {
693693
def : DSWritePat <DS_WRITE_B16_D16_HI, i32, store_local_hi16>;
694694
def : DSWritePat <DS_WRITE_B8_D16_HI, i32, truncstorei8_local_hi16>;
695695
}

llvm/lib/Target/AMDGPU/FLATInstructions.td

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -780,7 +780,7 @@ def : FlatAtomicPat <FLAT_ATOMIC_XOR_X2_RTN, atomic_xor_global, i64>;
780780
def : FlatStorePat <FLAT_STORE_BYTE, truncstorei8_flat, i16>;
781781
def : FlatStorePat <FLAT_STORE_SHORT, store_flat, i16>;
782782

783-
let OtherPredicates = [HasD16LoadStore] in {
783+
let OtherPredicates = [D16PreservesUnusedBits] in {
784784
def : FlatStorePat <FLAT_STORE_SHORT_D16_HI, truncstorei16_hi16_flat, i32>;
785785
def : FlatStorePat <FLAT_STORE_BYTE_D16_HI, truncstorei8_hi16_flat, i32>;
786786

@@ -824,7 +824,7 @@ def : FlatStoreSignedPat <GLOBAL_STORE_DWORD, store_global, i32>;
824824
def : FlatStoreSignedPat <GLOBAL_STORE_DWORDX2, store_global, v2i32>;
825825
def : FlatStoreSignedPat <GLOBAL_STORE_DWORDX4, store_global, v4i32>;
826826

827-
let OtherPredicates = [HasD16LoadStore] in {
827+
let OtherPredicates = [D16PreservesUnusedBits] in {
828828
def : FlatStoreSignedPat <GLOBAL_STORE_SHORT_D16_HI, truncstorei16_hi16_global, i32>;
829829
def : FlatStoreSignedPat <GLOBAL_STORE_BYTE_D16_HI, truncstorei8_hi16_global, i32>;
830830

0 commit comments

Comments
 (0)