
Commit 89f6a39

[libspirv][ptx-nvidiacl] Change __clc__group_scratch size to 32 x i128 (#18431)
This aligns the buffer with the comment in the file, which specifies 32 storage locations and 128 bits per warp. The file is also changed to opaque-pointer mode, and separate global variables are added for the different element sizes, resolving the `Reducing storage for small data types` comment.
1 parent 259dfbe commit 89f6a39
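
Under opaque pointers, each getter simply returns a `ptr addrspace(3)` to the appropriately sized scratch buffer, and callers index it at whatever element type they need, with no bitcast. A minimal caller sketch, assuming a hypothetical `@example_publish_lane` function and per-lane indexing (neither is part of this commit):

; Hypothetical caller sketch -- illustrative only, not part of this commit.
declare ptr addrspace(3) @__clc__get_group_scratch_int()

define void @example_publish_lane(i32 %value, i32 %lane) {
entry:
  ; The getter returns an opaque pointer; no bitcast is needed before
  ; indexing it as an i32 array.
  %base = call ptr addrspace(3) @__clc__get_group_scratch_int()
  %slot = getelementptr inbounds i32, ptr addrspace(3) %base, i32 %lane
  store i32 %value, ptr addrspace(3) %slot, align 4
  ret void
}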

2 files changed: +49 -99 lines changed

Lines changed: 21 additions & 36 deletions
@@ -1,62 +1,47 @@
 ; 32 storage locations is sufficient for all current-generation AMD GPUs
-; 64 bits per wavefront is sufficient for all fundamental data types
-; Reducing storage for small data types or increasing it for user-defined types
-; will likely require an additional pass to track group algorithm usage
-@__clc__group_scratch = internal addrspace(3) global [32 x i64] undef, align 1
+@__clc__group_scratch_i1 = internal addrspace(3) global [32 x i1] poison, align 1
+@__clc__group_scratch_i8 = internal addrspace(3) global [32 x i8] poison, align 1
+@__clc__group_scratch_i16 = internal addrspace(3) global [32 x i16] poison, align 2
+@__clc__group_scratch_i32 = internal addrspace(3) global [32 x i32] poison, align 4
+@__clc__group_scratch_i64 = internal addrspace(3) global [32 x i64] poison, align 8

-define i8 addrspace(3)* @__clc__get_group_scratch_bool() nounwind alwaysinline {
+define ptr addrspace(3) @__clc__get_group_scratch_bool() nounwind alwaysinline {
 entry:
-  %ptr = getelementptr inbounds [32 x i64], [32 x i64] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
-  %cast = bitcast i64 addrspace(3)* %ptr to i8 addrspace(3)*
-  ret i8 addrspace(3)* %cast
+  ret ptr addrspace(3) @__clc__group_scratch_i1
 }

-define i8 addrspace(3)* @__clc__get_group_scratch_char() nounwind alwaysinline {
+define ptr addrspace(3) @__clc__get_group_scratch_char() nounwind alwaysinline {
 entry:
-  %ptr = getelementptr inbounds [32 x i64], [32 x i64] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
-  %cast = bitcast i64 addrspace(3)* %ptr to i8 addrspace(3)*
-  ret i8 addrspace(3)* %cast
+  ret ptr addrspace(3) @__clc__group_scratch_i8
 }

-define i16 addrspace(3)* @__clc__get_group_scratch_short() nounwind alwaysinline {
+define ptr addrspace(3) @__clc__get_group_scratch_short() nounwind alwaysinline {
 entry:
-  %ptr = getelementptr inbounds [32 x i64], [32 x i64] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
-  %cast = bitcast i64 addrspace(3)* %ptr to i16 addrspace(3)*
-  ret i16 addrspace(3)* %cast
+  ret ptr addrspace(3) @__clc__group_scratch_i16
 }

-define i32 addrspace(3)* @__clc__get_group_scratch_int() nounwind alwaysinline {
+define ptr addrspace(3) @__clc__get_group_scratch_int() nounwind alwaysinline {
 entry:
-  %ptr = getelementptr inbounds [32 x i64], [32 x i64] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
-  %cast = bitcast i64 addrspace(3)* %ptr to i32 addrspace(3)*
-  ret i32 addrspace(3)* %cast
+  ret ptr addrspace(3) @__clc__group_scratch_i32
 }

-define i64 addrspace(3)* @__clc__get_group_scratch_long() nounwind alwaysinline {
+define ptr addrspace(3) @__clc__get_group_scratch_long() nounwind alwaysinline {
 entry:
-  %ptr = getelementptr inbounds [32 x i64], [32 x i64] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
-  %cast = bitcast i64 addrspace(3)* %ptr to i64 addrspace(3)*
-  ret i64 addrspace(3)* %cast
+  ret ptr addrspace(3) @__clc__group_scratch_i64
 }

-define half addrspace(3)* @__clc__get_group_scratch_half() nounwind alwaysinline {
+define ptr addrspace(3) @__clc__get_group_scratch_half() nounwind alwaysinline {
 entry:
-  %ptr = getelementptr inbounds [32 x i64], [32 x i64] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
-  %cast = bitcast i64 addrspace(3)* %ptr to half addrspace(3)*
-  ret half addrspace(3)* %cast
+  ret ptr addrspace(3) @__clc__group_scratch_i16
 }

-define float addrspace(3)* @__clc__get_group_scratch_float() nounwind alwaysinline {
+define ptr addrspace(3) @__clc__get_group_scratch_float() nounwind alwaysinline {
 entry:
-  %ptr = getelementptr inbounds [32 x i64], [32 x i64] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
-  %cast = bitcast i64 addrspace(3)* %ptr to float addrspace(3)*
-  ret float addrspace(3)* %cast
+  ret ptr addrspace(3) @__clc__group_scratch_i32
 }

-define double addrspace(3)* @__clc__get_group_scratch_double() nounwind alwaysinline {
+define ptr addrspace(3) @__clc__get_group_scratch_double() nounwind alwaysinline {
 entry:
-  %ptr = getelementptr inbounds [32 x i64], [32 x i64] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
-  %cast = bitcast i64 addrspace(3)* %ptr to double addrspace(3)*
-  ret double addrspace(3)* %cast
+  ret ptr addrspace(3) @__clc__group_scratch_i64
 }
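
Note the size-based mapping in the new getters: `half` returns the i16 buffer, `float` the i32 buffer, and `double` the i64 buffer, since only storage size matters once the pointer is opaque. A hypothetical illustration (the `@example_store_half` name is assumed, not from the commit):

; Hypothetical sketch: a half (16 bits) lives in the i16-sized scratch;
; the opaque pointer is indexed directly at type half.
declare ptr addrspace(3) @__clc__get_group_scratch_half()

define void @example_store_half(half %h, i32 %lane) {
entry:
  %base = call ptr addrspace(3) @__clc__get_group_scratch_half()
  %slot = getelementptr inbounds half, ptr addrspace(3) %base, i32 %lane
  store half %h, ptr addrspace(3) %slot, align 2
  ret void
}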

Lines changed: 28 additions & 63 deletions
@@ -1,97 +1,62 @@
 ; 32 storage locations is sufficient for all current-generation NVIDIA GPUs
-; 128 bits per warp is sufficient for all fundamental data types and complex
-; Reducing storage for small data types or increasing it for user-defined types
-; will likely require an additional pass to track group algorithm usage
-@__clc__group_scratch = internal addrspace(3) global [128 x i64] undef, align 1
+@__clc__group_scratch_i1 = internal addrspace(3) global [32 x i1] poison, align 1
+@__clc__group_scratch_i8 = internal addrspace(3) global [32 x i8] poison, align 1
+@__clc__group_scratch_i16 = internal addrspace(3) global [32 x i16] poison, align 2
+@__clc__group_scratch_i32 = internal addrspace(3) global [32 x i32] poison, align 4
+@__clc__group_scratch_i64 = internal addrspace(3) global [32 x i64] poison, align 8
+@__clc__group_scratch_i128 = internal addrspace(3) global [32 x i128] poison, align 8

-define i8 addrspace(3)* @__clc__get_group_scratch_bool() nounwind alwaysinline {
+define ptr addrspace(3) @__clc__get_group_scratch_bool() nounwind alwaysinline {
 entry:
-  %ptr = getelementptr inbounds [128 x i64], [128 x i64] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
-  %cast = bitcast i64 addrspace(3)* %ptr to i8 addrspace(3)*
-  ret i8 addrspace(3)* %cast
+  ret ptr addrspace(3) @__clc__group_scratch_i1
 }

-define i8 addrspace(3)* @__clc__get_group_scratch_char() nounwind alwaysinline {
+define ptr addrspace(3) @__clc__get_group_scratch_char() nounwind alwaysinline {
 entry:
-  %ptr = getelementptr inbounds [128 x i64], [128 x i64] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
-  %cast = bitcast i64 addrspace(3)* %ptr to i8 addrspace(3)*
-  ret i8 addrspace(3)* %cast
+  ret ptr addrspace(3) @__clc__group_scratch_i8
 }

-define i16 addrspace(3)* @__clc__get_group_scratch_short() nounwind alwaysinline {
+define ptr addrspace(3) @__clc__get_group_scratch_short() nounwind alwaysinline {
 entry:
-  %ptr = getelementptr inbounds [128 x i64], [128 x i64] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
-  %cast = bitcast i64 addrspace(3)* %ptr to i16 addrspace(3)*
-  ret i16 addrspace(3)* %cast
+  ret ptr addrspace(3) @__clc__group_scratch_i16
 }

-define i32 addrspace(3)* @__clc__get_group_scratch_int() nounwind alwaysinline {
+define ptr addrspace(3) @__clc__get_group_scratch_int() nounwind alwaysinline {
 entry:
-  %ptr = getelementptr inbounds [128 x i64], [128 x i64] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
-  %cast = bitcast i64 addrspace(3)* %ptr to i32 addrspace(3)*
-  ret i32 addrspace(3)* %cast
+  ret ptr addrspace(3) @__clc__group_scratch_i32
 }

-define i64 addrspace(3)* @__clc__get_group_scratch_long() nounwind alwaysinline {
+define ptr addrspace(3) @__clc__get_group_scratch_long() nounwind alwaysinline {
 entry:
-  %ptr = getelementptr inbounds [128 x i64], [128 x i64] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
-  %cast = bitcast i64 addrspace(3)* %ptr to i64 addrspace(3)*
-  ret i64 addrspace(3)* %cast
+  ret ptr addrspace(3) @__clc__group_scratch_i64
 }

-define half addrspace(3)* @__clc__get_group_scratch_half() nounwind alwaysinline {
+define ptr addrspace(3) @__clc__get_group_scratch_half() nounwind alwaysinline {
 entry:
-  %ptr = getelementptr inbounds [128 x i64], [128 x i64] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
-  %cast = bitcast i64 addrspace(3)* %ptr to half addrspace(3)*
-  ret half addrspace(3)* %cast
+  ret ptr addrspace(3) @__clc__group_scratch_i16
 }

-define float addrspace(3)* @__clc__get_group_scratch_float() nounwind alwaysinline {
+define ptr addrspace(3) @__clc__get_group_scratch_float() nounwind alwaysinline {
 entry:
-  %ptr = getelementptr inbounds [128 x i64], [128 x i64] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
-  %cast = bitcast i64 addrspace(3)* %ptr to float addrspace(3)*
-  ret float addrspace(3)* %cast
+  ret ptr addrspace(3) @__clc__group_scratch_i32
 }

-define double addrspace(3)* @__clc__get_group_scratch_double() nounwind alwaysinline {
+define ptr addrspace(3) @__clc__get_group_scratch_double() nounwind alwaysinline {
 entry:
-  %ptr = getelementptr inbounds [128 x i64], [128 x i64] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
-  %cast = bitcast i64 addrspace(3)* %ptr to double addrspace(3)*
-  ret double addrspace(3)* %cast
+  ret ptr addrspace(3) @__clc__group_scratch_i64
 }

-%complex_half = type {
-  half,
-  half
-}
-
-%complex_float = type {
-  float,
-  float
-}
-
-%complex_double = type {
-  double,
-  double
-}
-
-define %complex_half addrspace(3)* @__clc__get_group_scratch_complex_half() nounwind alwaysinline {
+define ptr addrspace(3) @__clc__get_group_scratch_complex_half() nounwind alwaysinline {
 entry:
-  %ptr = getelementptr inbounds [128 x i64], [128 x i64] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
-  %cast = bitcast i64 addrspace(3)* %ptr to %complex_half addrspace(3)*
-  ret %complex_half addrspace(3)* %cast
+  ret ptr addrspace(3) @__clc__group_scratch_i32
 }

-define %complex_float addrspace(3)* @__clc__get_group_scratch_complex_float() nounwind alwaysinline {
+define ptr addrspace(3) @__clc__get_group_scratch_complex_float() nounwind alwaysinline {
 entry:
-  %ptr = getelementptr inbounds [128 x i64], [128 x i64] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
-  %cast = bitcast i64 addrspace(3)* %ptr to %complex_float addrspace(3)*
-  ret %complex_float addrspace(3)* %cast
+  ret ptr addrspace(3) @__clc__group_scratch_i64
 }

-define %complex_double addrspace(3)* @__clc__get_group_scratch_complex_double() nounwind alwaysinline {
+define ptr addrspace(3) @__clc__get_group_scratch_complex_double() nounwind alwaysinline {
 entry:
-  %ptr = getelementptr inbounds [128 x i64], [128 x i64] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
-  %cast = bitcast i64 addrspace(3)* %ptr to %complex_double addrspace(3)*
-  ret %complex_double addrspace(3)* %cast
+  ret ptr addrspace(3) @__clc__group_scratch_i128
 }
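
The i128 buffer appears only in the NVIDIA file because `%complex_double` (two doubles) needs the full 128 bits per lane named in the commit title. A sketch of how one such value occupies one slot, assuming a hypothetical `@example_store_complex` helper:

; Hypothetical sketch -- a { double, double } value is 128 bits wide,
; so each [32 x i128] element holds exactly one lane's complex double.
%complex_double = type { double, double }

declare ptr addrspace(3) @__clc__get_group_scratch_complex_double()

define void @example_store_complex(%complex_double %v, i32 %lane) {
entry:
  %base = call ptr addrspace(3) @__clc__get_group_scratch_complex_double()
  %slot = getelementptr inbounds i128, ptr addrspace(3) %base, i32 %lane
  store %complex_double %v, ptr addrspace(3) %slot, align 8
  ret void
}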
