Skip to content

Commit 444c468

Browse files
committed
Update design to use separate bit instead of digit
1 parent c3db575 commit 444c468

File tree

4 files changed

+32
-26
lines changed

4 files changed

+32
-26
lines changed

llvm/docs/NVPTXUsage.rst

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -185,19 +185,19 @@ For example, take ``sm_103a`` (10 represents ``X``, 3 represents ``Y``, and ``a`
185185
represents ``z``), ``sm_103f``, and ``sm_103`` architecture variants. The ``sm_103`` is
186186
compatible with ``sm_103a`` and ``sm_103f``, and ``sm_103f`` is compatible with ``sm_103a``.
187187

188-
Encoding := Arch * 100 + 10 (for 'f') + 1 (for 'a')
188+
Encoding := Arch * 10 + 2 (for 'f') + 1 (for 'a')
189189
Arch := X * 10 + Y
190190

191-
For example, ``sm_103a`` is encoded as 10311 (103 * 100 + 10 + 1) and ``sm_103f`` is
192-
encoded as 10310 (103 * 100 + 10).
191+
For example, ``sm_103a`` is encoded as 1033 (103 * 10 + 2 + 1) and ``sm_103f`` is
192+
encoded as 1032 (103 * 10 + 2).
193193

194194
This encoding allows simple partial ordering of the architectures.
195195

196-
* Compare Family and Arch by dividing FullSMVersion by 1000 and 100
196+
* Compare Family and Arch by dividing FullSMVersion by 100 and 10
197197
respectively before the comparison.
198198
* Compare within the family by comparing FullSMVersion, given both belong to
199199
the same family.
200-
* Detect ``a`` variants by checking FullSMVersion % 10.
200+
* Detect ``a`` variants by checking FullSMVersion & 1.
201201

202202
.. _nvptx_intrinsics:
203203

llvm/lib/Target/NVPTX/NVPTX.td

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -68,33 +68,33 @@ class FeaturePTX<int version>:
6868
// represents 'z'), sm_103f, and sm_103 architecture variants. The sm_103 is
6969
// compatible with sm_103a and sm_103f, and sm_103f is compatible with sm_103a.
7070
//
71-
// Encoding := Arch * 100 + 10 (for 'f') + 1 (for 'a')
71+
// Encoding := Arch * 10 + 2 (for 'f') + 1 (for 'a')
7272
// Arch := X * 10 + Y
7373
//
74-
// For example, sm_103a is encoded as 10311 (103 * 100 + 10 + 1) and sm_103f is
75-
// encoded as 10310 (103 * 100 + 10).
74+
// For example, sm_103a is encoded as 1033 (103 * 10 + 2 + 1) and sm_103f is
75+
// encoded as 1032 (103 * 10 + 2).
7676
//
7777
// This encoding allows simple partial ordering of the architectures.
78-
// + Compare Family and Arch by dividing FullSMVersion by 1000 and 100
78+
// + Compare Family and Arch by dividing FullSMVersion by 100 and 10
7979
// respectively before the comparison.
8080
// + Compare within the family by comparing FullSMVersion, given both belong to
8181
// the same family.
82-
// + Detect 'a' variants by checking FullSMVersion % 10.
82+
// + Detect 'a' variants by checking FullSMVersion & 1.
8383
foreach sm = [20, 21, 30, 32, 35, 37, 50, 52, 53,
8484
60, 61, 62, 70, 72, 75, 80, 86, 87,
8585
89, 90, 100, 101, 103, 120, 121] in {
86-
// Base SM version (e.g. FullSMVersion for sm_100 is 10000)
87-
def SM#sm : FeatureSM<""#sm, !mul(sm, 100)>;
86+
// Base SM version (e.g. FullSMVersion for sm_100 is 1000)
87+
def SM#sm : FeatureSM<""#sm, !mul(sm, 10)>;
8888

8989
// Family-specific targets which are compatible within same family
90-
// (e.g. FullSMVersion for sm_100f is 10010)
90+
// (e.g. FullSMVersion for sm_100f is 1002)
9191
if !ge(sm, 100) then
92-
def SM#sm#f : FeatureSM<""#sm#"f", !add(!mul(sm, 100), 10)>;
92+
def SM#sm#f : FeatureSM<""#sm#"f", !add(!mul(sm, 10), 2)>;
9393

9494
// Architecture-specific targets which are incompatible across architectures
95-
// (e.g. FullSMVersion for sm_100a is 10011)
95+
// (e.g. FullSMVersion for sm_100a is 1003)
9696
if !ge(sm, 90) then
97-
def SM#sm#a : FeatureSM<""#sm#"a", !add(!mul(sm, 100), 11)>;
97+
def SM#sm#a : FeatureSM<""#sm#"a", !add(!mul(sm, 10), 3)>;
9898
}
9999

100100
foreach version = [32, 40, 41, 42, 43, 50, 60, 61, 62, 63, 64, 65,

llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ NVPTXSubtarget::NVPTXSubtarget(const Triple &TT, const std::string &CPU,
5555
const std::string &FS,
5656
const NVPTXTargetMachine &TM)
5757
: NVPTXGenSubtargetInfo(TT, CPU, /*TuneCPU*/ CPU, FS), PTXVersion(0),
58-
FullSmVersion(2000), SmVersion(getSmVersion()),
58+
FullSmVersion(200), SmVersion(getSmVersion()),
5959
TLInfo(TM, initializeSubtargetDependencies(CPU, FS)) {
6060
TSInfo = std::make_unique<NVPTXSelectionDAGInfo>();
6161
}

llvm/lib/Target/NVPTX/NVPTXSubtarget.h

Lines changed: 15 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -108,8 +108,8 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
108108
switch (FullSmVersion) {
109109
default:
110110
break;
111-
case 10011: // sm_100a
112-
case 10111: // sm_101a
111+
case 1003: // sm_100a
112+
case 1013: // sm_101a
113113
HasTcgen05 = true;
114114
break;
115115
}
@@ -120,9 +120,15 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
120120
// TMA G2S copy with cta_group::1/2 support
121121
bool hasCpAsyncBulkTensorCTAGroupSupport() const {
122122
// TODO: Update/tidy-up after the family-conditional support arrives
123-
return ((FullSmVersion == 10011 || FullSmVersion == 10111) &&
124-
PTXVersion >= 86) ||
125-
(FullSmVersion == 10311 && PTXVersion >= 88);
123+
switch (FullSmVersion) {
124+
case 1003:
125+
case 1013:
126+
return PTXVersion >= 86;
127+
case 1033:
128+
return PTXVersion >= 88;
129+
default:
130+
return false;
131+
}
126132
}
127133

128134
// Prior to CUDA 12.3 ptxas did not recognize that the trap instruction
@@ -135,24 +141,24 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
135141
bool hasPTXASUnreachableBug() const { return PTXVersion < 83; }
136142
bool hasCvtaParam() const { return SmVersion >= 70 && PTXVersion >= 77; }
137143
unsigned int getFullSmVersion() const { return FullSmVersion; }
138-
unsigned int getSmVersion() const { return getFullSmVersion() / 100; }
144+
unsigned int getSmVersion() const { return getFullSmVersion() / 10; }
139145
// GPUs with "a" suffix include architecture-accelerated features that
140146
// are supported on the specified architecture only, hence such targets do not
141147
// follow the onion layer model. hasArchAccelFeatures() allows
142148
// distinguishing such GPU variants from the base GPU architecture.
143149
// - false represents non-accelerated architecture.
144150
// - true represents architecture-accelerated variant.
145151
bool hasArchAccelFeatures() const {
146-
return getFullSmVersion() % 10 && PTXVersion >= 80;
152+
return (getFullSmVersion() & 1) && PTXVersion >= 80;
147153
}
148154
// GPUs with 'f' suffix have architecture-accelerated features which are
149155
// portable across all future architectures under same SM major. For example,
150156
// sm_100f features will work for sm_10X*f*/sm_10X*a* future architectures.
151157
// - false represents non-family-specific architecture.
152158
// - true represents family-specific variant.
153159
bool hasFamilySpecificFeatures() const {
154-
return getFullSmVersion() % 100 == 10 ? PTXVersion >= 88
155-
: hasArchAccelFeatures();
160+
return getFullSmVersion() % 10 == 2 ? PTXVersion >= 88
161+
: hasArchAccelFeatures();
156162
}
157163
// If the user did not provide a target we default to the `sm_30` target.
158164
std::string getTargetName() const {

0 commit comments

Comments
 (0)