@@ -85,12 +85,12 @@ namespace sycl::ext::oneapi {
85
85
int32_t dot_acc(vec<int8_t,4> a, vec<int8_t,4> b, int32_t c);
86
86
int32_t dot_acc(vec<int8_t,4> a, vec<uint8_t,4> b, int32_t c);
87
87
int32_t dot_acc(vec<uint8_t,4> a, vec<int8_t,4> b, int32_t c);
88
- int32_t dot_acc(vec<uint8_t,4> a, vec<uint8_t,4> b, int32_t c);
88
+ uint32_t dot_acc(vec<uint8_t,4> a, vec<uint8_t,4> b, uint32_t c);
89
89
90
- int32_t doc_acc_4x8packed(int32_t a, int32_t b, int32_t c);
91
- int32_t doc_acc_4x8packed(int32_t a, uint32_t b, int32_t c);
92
- int32_t doc_acc_4x8packed (uint32_t a, int32_t b, int32_t c);
93
- int32_t doc_acc_4x8packed (uint32_t a, uint32_t b, int32_t c);
90
+ int32_t doc_acc_4x8packed_ss(uint32_t a, uint32_t b, int32_t c);
91
+ int32_t doc_acc_4x8packed_su(uint32_t a, uint32_t b, int32_t c);
92
+ int32_t doc_acc_4x8packed_us(uint32_t a, uint32_t b, int32_t c);
93
+ uint32_t doc_acc_4x8packed_uu(uint32_t a, uint32_t b, uint32_t c);
94
94
95
95
} // namespace sycl::ext::oneapi
96
96
----
@@ -111,51 +111,71 @@ int32_t dot_acc(vec<int8_t,4> a,
111
111
int32_t dot_acc(vec<uint8_t,4> a,
112
112
vec<int8_t,4> b,
113
113
int32_t c)
114
- int32_t dot_acc(vec<uint8_t,4> a,
115
- vec<uint8_t,4> b,
116
- int32_t c)
114
+ uint32_t dot_acc(vec<uint8_t,4> a,
115
+ vec<uint8_t,4> b,
116
+ uint32_t c)
117
+ ----
118
+
119
+ |Performs a four-component integer dot product accumulate operation. The value
120
+ that is returned is equivalent to `dot(a, b) + c`, where `dot` computes the
121
+ dot product of two vectors.
122
+
123
+ |[source,c]
124
+ ----
125
+ int32_t doc_acc_4x8packed_ss(uint32_t a,
126
+ uint32_t b,
127
+ int32_t c)
128
+ ----
129
+
130
+ |Performs a four-component integer dot product accumulate operation, where
131
+ `a` and `b` are both interpreted as `vec<int8_t,4>`.
132
+
133
+ |[source,c]
134
+ ----
135
+ int32_t doc_acc_4x8packed_su(uint32_t a,
136
+ uint32_t b,
137
+ int32_t c)
117
138
----
118
139
119
- |Performs a four-component integer dot product accumulate operation. +
120
- {blank}
121
- The value that is returned is equivalent to +
122
- {blank}
123
- `dot(a, b) + c`
140
+ |Performs a four-component integer dot product accumulate operation, where
141
+ `a` is interpreted as `vec<int8_t,4>` and `b` is interpreted as
142
+ `vec<uint8_t,4>`.
124
143
125
144
|[source,c]
126
145
----
127
- int32_t doc_acc_4x8packed(int32_t a,
128
- int32_t b,
129
- int32_t c)
130
- int32_t doc_acc_4x8packed(int32_t a,
131
- uint32_t b,
132
- int32_t c)
133
- int32_t doc_acc_4x8packed(uint32_t a,
134
- int32_t b,
135
- int32_t c)
136
- int32_t doc_acc_4x8packed(uint32_t a,
137
- uint32_t b,
138
- int32_t c);
146
+ int32_t doc_acc_4x8packed_us(uint32_t a,
147
+ uint32_t b,
148
+ int32_t c)
139
149
----
140
150
141
151
|Performs a four-component integer dot product accumulate operation, where
142
- `a` and `b` are 32-bit integers that represent a vector of 4 8-bit elements.
143
- When the type of `a` or `b` is `int32_t`, it is interpreted as `vec<int8_t,4>`.
144
- When the type of `a` or `b` is `uint32_t`, it is interpreted as
145
- `vec<uint8_t,4>`. In each case, the least significant byte is element 0, and
146
- the most significant byte is element 3.
152
+ `a` is interpreted as `vec<uint8_t,4>` and `b` is interpreted as
153
+ `vec<int8_t,4>`.
147
154
155
+ |[source,c]
156
+ ----
157
+ uint32_t doc_acc_4x8packed_uu(uint32_t a,
158
+ uint32_t b,
159
+ uint32_t c)
160
+ ----
161
+
162
+ |Performs a four-component integer dot product accumulate operation, where
163
+ `a` and `b` are both interpreted as `vec<uint8_t,4>`.
148
164
|====
149
165
166
+ For all the "packed" functions, the least significant byte of the integer is
167
+ element 0, and the most significant byte is element 3.
168
+
150
169
=== Deprecated functions
151
170
152
- The following functions are deprecated. They have the same effect as the
153
- `doc_acc_4x8packed` overloads described above.
171
+ The following functions are deprecated.
154
172
155
173
[source,c++]
156
174
----
157
175
namespace sycl::ext::oneapi {
158
176
177
+ int32_t dot_acc(vec<uint8_t,4> a, vec<uint8_t,4> b, int32_t c);
178
+
159
179
int32_t dot_acc(int32_t a, int32_t b, int32_t c);
160
180
int32_t dot_acc(int32_t a, uint32_t b, int32_t c);
161
181
int32_t dot_acc(uint32_t a, int32_t b, int32_t c);
@@ -164,15 +184,63 @@ int32_t dot_acc(uint32_t a, uint32_t b, int32_t c);
164
184
} // namespace sycl::ext::oneapi
165
185
----
166
186
187
+ [cols="4a,4",options="header"]
188
+ |====
189
+ | *Function*
190
+ | *Description*
167
191
168
- == Issues
192
+ |[source,c]
193
+ ----
194
+ int32_t dot_acc(vec<uint8_t,4> a,
195
+ vec<uint8_t,4> b,
196
+ int32_t c)
197
+ ----
198
+
199
+ |Performs a four-component integer dot product accumulate operation, where the
200
+ elements of `a` and `b` are unsigned while `c` is signed. Use the version
201
+ taking an unsigned `c` and returning `uint32_t` instead.
202
+
203
+ |[source,c]
204
+ ----
205
+ int32_t dot_acc(int32_t a,
206
+ int32_t b,
207
+ int32_t c)
208
+ ----
169
209
170
- * The overloads that take two unsigned vectors have a signed `c` and return a
171
- signed result. This is inconsistent with the SPIR-V primitives and the
172
- OpenCL C APIs, both of which return an unsigned value in this case and expect
173
- an unsigned `c`. I think we could implement the APIs as they are using the
174
- SPIR-V primitives, but the extra unsigned-to-signed conversions might
175
- generate less efficient code (I haven't checked). Is there a compelling
176
- reason to keep these APIs as they are now? If not, we could deprecate them
177
- and introduce overloads that take an unsigned `c` and return an unsigned
178
- value.
210
+ |Performs a four-component integer dot product accumulate operation, where
211
+ `a` and `b` are both interpreted as `vec<int8_t,4>`. Use
212
+ `doc_acc_4x8packed_ss` instead.
213
+
214
+ |[source,c]
215
+ ----
216
+ int32_t dot_acc(int32_t a,
217
+ uint32_t b,
218
+ int32_t c)
219
+ ----
220
+
221
+ |Performs a four-component integer dot product accumulate operation, where
222
+ `a` is interpreted as `vec<int8_t,4>` and `b` is interpreted as
223
+ `vec<uint8_t,4>`. Use `doc_acc_4x8packed_su` instead.
224
+
225
+ |[source,c]
226
+ ----
227
+ int32_t dot_acc(uint32_t a,
228
+ int32_t b,
229
+ int32_t c)
230
+ ----
231
+
232
+ |Performs a four-component integer dot product accumulate operation, where
233
+ `a` is interpreted as `vec<uint8_t,4>` and `b` is interpreted as
234
+ `vec<int8_t,4>`. Use `doc_acc_4x8packed_us` instead.
235
+
236
+ |[source,c]
237
+ ----
238
+ int32_t dot_acc(uint32_t a,
239
+ uint32_t b,
240
+ int32_t c)
241
+ ----
242
+
243
+ |Performs a four-component integer dot product accumulate operation, where
244
+ `a` and `b` are both interpreted as `vec<uint8_t,4>`. Use
245
+ `doc_acc_4x8packed_uu` instead.
246
+ |====
0 commit comments