|
3 | 3 |
|
4 | 4 | using System;
|
5 | 5 | using System.Buffers;
|
| 6 | +using System.Diagnostics; |
6 | 7 | using System.IO.Pipelines;
|
7 | 8 | using System.Runtime.CompilerServices;
|
8 | 9 | using System.Runtime.InteropServices;
|
| 10 | +using System.Runtime.Intrinsics.X86; |
9 | 11 |
|
10 | 12 | namespace System.Buffers
|
11 | 13 | {
|
@@ -174,87 +176,137 @@ private unsafe static void WriteAsciiMultiWrite(ref this BufferWriter<PipeWriter
|
174 | 176 | private static unsafe void EncodeAsciiCharsToBytes(char* input, byte* output, int length)
|
175 | 177 | {
|
176 | 178 | // Note: Not BIGENDIAN or check for non-ascii
|
177 |
| - const int Shift16Shift24 = (1 << 16) | (1 << 24); |
178 |
| - const int Shift8Identity = (1 << 8) | (1); |
179 |
| - |
180 |
| - // Encode as bytes up to the first non-ASCII byte and return count encoded |
181 |
| - int i = 0; |
182 |
| - // Use Intrinsic switch |
183 |
| - if (IntPtr.Size == 8) // 64 bit |
| 179 | + if (Bmi2.IsSupported) |
184 | 180 | {
|
185 |
| - if (length < 4) goto trailing; |
186 |
| - |
187 |
| - int unaligned = (int)(((ulong)input) & 0x7) >> 1; |
188 |
| - // Unaligned chars |
189 |
| - for (; i < unaligned; i++) |
| 181 | + if (length < 4) |
190 | 182 | {
|
191 |
| - char ch = *(input + i); |
192 |
| - *(output + i) = (byte)ch; // Cast convert |
| 183 | + // Convert the chars to bytes one by one if there are less than 4. |
| 184 | + for (int i = 0; i < length; i++) |
| 185 | + { |
| 186 | + char ch = input[i]; |
| 187 | + output[i] = (byte)ch; // Cast convert |
| 188 | + } |
193 | 189 | }
|
194 |
| - |
195 |
| - // Aligned |
196 |
| - int ulongDoubleCount = (length - i) & ~0x7; |
197 |
| - for (; i < ulongDoubleCount; i += 8) |
| 190 | + else if (Bmi2.X64.IsSupported) // 64-bit, 4+ chars |
198 | 191 | {
|
199 |
| - ulong inputUlong0 = *(ulong*)(input + i); |
200 |
| - ulong inputUlong1 = *(ulong*)(input + i + 4); |
201 |
| - // Pack 16 ASCII chars into 16 bytes |
202 |
| - *(uint*)(output + i) = |
203 |
| - ((uint)((inputUlong0 * Shift16Shift24) >> 24) & 0xffff) | |
204 |
| - ((uint)((inputUlong0 * Shift8Identity) >> 24) & 0xffff0000); |
205 |
| - *(uint*)(output + i + 4) = |
206 |
| - ((uint)((inputUlong1 * Shift16Shift24) >> 24) & 0xffff) | |
207 |
| - ((uint)((inputUlong1 * Shift8Identity) >> 24) & 0xffff0000); |
| 192 | + // Convert all the 4 char sequences, except final 1 - 4 char sequence. |
| 193 | + int firstLength = length - sizeof(int); |
| 194 | + Debug.Assert(firstLength >= 0); |
| 195 | + for (int i = 0; i < firstLength; i += sizeof(int)) |
| 196 | + { |
| 197 | + *(uint*)(output + i) = (uint)Bmi2.X64.ParallelBitExtract( |
| 198 | + *(ulong*)(input + i), |
| 199 | + 0x00FF00FF_00FF00FFul); |
| 200 | + } |
| 201 | + |
| 202 | + // Convert the final sequence of 4 from the end. |
| 203 | + // This may overlap with the last sequence of the loop, if length is not a multiple of 4. |
| 204 | + *(uint*)(output + firstLength) = (uint)Bmi2.X64.ParallelBitExtract( |
| 205 | + *(ulong*)(input + firstLength), |
| 206 | + 0x00FF00FF_00FF00FFul); |
208 | 207 | }
|
209 |
| - if (length - 4 > i) |
| 208 | + else // 32-bit, 4+ chars |
210 | 209 | {
|
211 |
| - ulong inputUlong = *(ulong*)(input + i); |
212 |
| - // Pack 8 ASCII chars into 8 bytes |
213 |
| - *(uint*)(output + i) = |
214 |
| - ((uint)((inputUlong * Shift16Shift24) >> 24) & 0xffff) | |
215 |
| - ((uint)((inputUlong * Shift8Identity) >> 24) & 0xffff0000); |
216 |
| - i += 4; |
217 |
| - } |
| 210 | + // Convert all the 2 char sequences, except final 1 - 2 char sequence |
| 211 | + int firstLength = length - sizeof(ushort); |
| 212 | + Debug.Assert(firstLength >= 0); |
| 213 | + for (int i = 0; i < firstLength; i += sizeof(ushort)) |
| 214 | + { |
| 215 | + *(ushort*)(output + i) = (ushort)Bmi2.ParallelBitExtract( |
| 216 | + *(uint*)(input + i), |
| 217 | + 0x00FF00FFu); |
| 218 | + } |
218 | 219 |
|
219 |
| - trailing: |
220 |
| - for (; i < length; i++) |
221 |
| - { |
222 |
| - char ch = *(input + i); |
223 |
| - *(output + i) = (byte)ch; // Cast convert |
| 220 | + // Convert the final sequence of 2 from the end. |
| 221 | + // This may overlap with the last sequence of the loop, if length is not a multiple of 2 |
| 222 | + *(ushort*)(output + firstLength) = (ushort)Bmi2.ParallelBitExtract( |
| 223 | + *(uint*)(input + firstLength), |
| 224 | + 0x00FF00FFu); |
224 | 225 | }
|
225 | 226 | }
|
226 |
| - else // 32 bit |
| 227 | + else |
227 | 228 | {
|
228 |
| - // Unaligned chars |
229 |
| - if ((unchecked((int)input) & 0x2) != 0) |
230 |
| - { |
231 |
| - char ch = *input; |
232 |
| - i = 1; |
233 |
| - *(output) = (byte)ch; // Cast convert |
234 |
| - } |
| 229 | + const int Shift16Shift24 = (1 << 16) | (1 << 24); |
| 230 | + const int Shift8Identity = (1 << 8) | (1); |
235 | 231 |
|
236 |
| - // Aligned |
237 |
| - int uintCount = (length - i) & ~0x3; |
238 |
| - for (; i < uintCount; i += 4) |
| 232 | + int i = 0; |
| 233 | + // Use Intrinsic switch |
| 234 | + if (IntPtr.Size == 8) // 64 bit |
239 | 235 | {
|
240 |
| - uint inputUint0 = *(uint*)(input + i); |
241 |
| - uint inputUint1 = *(uint*)(input + i + 2); |
242 |
| - // Pack 4 ASCII chars into 4 bytes |
243 |
| - *(ushort*)(output + i) = (ushort)(inputUint0 | (inputUint0 >> 8)); |
244 |
| - *(ushort*)(output + i + 2) = (ushort)(inputUint1 | (inputUint1 >> 8)); |
| 236 | + if (length < 4) goto trailing; |
| 237 | + |
| 238 | + int unaligned = (int)(((ulong)input) & 0x7) >> 1; |
| 239 | + // Unaligned chars |
| 240 | + for (; i < unaligned; i++) |
| 241 | + { |
| 242 | + char ch = input[i]; |
| 243 | + output[i] = (byte)ch; // Cast convert |
| 244 | + } |
| 245 | + |
| 246 | + // Aligned |
| 247 | + int ulongDoubleCount = (length - i) & ~0x7; |
| 248 | + for (; i < ulongDoubleCount; i += 8) |
| 249 | + { |
| 250 | + ulong inputUlong0 = *(ulong*)(input + i); |
| 251 | + ulong inputUlong1 = *(ulong*)(input + i + 4); |
| 252 | + // Pack 16 ASCII chars into 16 bytes |
| 253 | + *(uint*)(output + i) = |
| 254 | + ((uint)((inputUlong0 * Shift16Shift24) >> 24) & 0xffff) | |
| 255 | + ((uint)((inputUlong0 * Shift8Identity) >> 24) & 0xffff0000); |
| 256 | + *(uint*)(output + i + 4) = |
| 257 | + ((uint)((inputUlong1 * Shift16Shift24) >> 24) & 0xffff) | |
| 258 | + ((uint)((inputUlong1 * Shift8Identity) >> 24) & 0xffff0000); |
| 259 | + } |
| 260 | + if (length - 4 > i) |
| 261 | + { |
| 262 | + ulong inputUlong = *(ulong*)(input + i); |
| 263 | + // Pack 8 ASCII chars into 8 bytes |
| 264 | + *(uint*)(output + i) = |
| 265 | + ((uint)((inputUlong * Shift16Shift24) >> 24) & 0xffff) | |
| 266 | + ((uint)((inputUlong * Shift8Identity) >> 24) & 0xffff0000); |
| 267 | + i += 4; |
| 268 | + } |
| 269 | + |
| 270 | + trailing: |
| 271 | + for (; i < length; i++) |
| 272 | + { |
| 273 | + char ch = input[i]; |
| 274 | + output[i] = (byte)ch; // Cast convert |
| 275 | + } |
245 | 276 | }
|
246 |
| - if (length - 1 > i) |
| 277 | + else // 32 bit |
247 | 278 | {
|
248 |
| - uint inputUint = *(uint*)(input + i); |
249 |
| - // Pack 2 ASCII chars into 2 bytes |
250 |
| - *(ushort*)(output + i) = (ushort)(inputUint | (inputUint >> 8)); |
251 |
| - i += 2; |
252 |
| - } |
| 279 | + // Unaligned chars |
| 280 | + if ((unchecked((int)input) & 0x2) != 0) |
| 281 | + { |
| 282 | + char ch = *input; |
| 283 | + i = 1; |
| 284 | + output[0] = (byte)ch; // Cast convert |
| 285 | + } |
253 | 286 |
|
254 |
| - if (i < length) |
255 |
| - { |
256 |
| - char ch = *(input + i); |
257 |
| - *(output + i) = (byte)ch; // Cast convert |
| 287 | + // Aligned |
| 288 | + int uintCount = (length - i) & ~0x3; |
| 289 | + for (; i < uintCount; i += 4) |
| 290 | + { |
| 291 | + uint inputUint0 = *(uint*)(input + i); |
| 292 | + uint inputUint1 = *(uint*)(input + i + 2); |
| 293 | + // Pack 4 ASCII chars into 4 bytes |
| 294 | + *(ushort*)(output + i) = (ushort)(inputUint0 | (inputUint0 >> 8)); |
| 295 | + *(ushort*)(output + i + 2) = (ushort)(inputUint1 | (inputUint1 >> 8)); |
| 296 | + } |
| 297 | + if (length - 1 > i) |
| 298 | + { |
| 299 | + uint inputUint = *(uint*)(input + i); |
| 300 | + // Pack 2 ASCII chars into 2 bytes |
| 301 | + *(ushort*)(output + i) = (ushort)(inputUint | (inputUint >> 8)); |
| 302 | + i += 2; |
| 303 | + } |
| 304 | + |
| 305 | + if (i < length) |
| 306 | + { |
| 307 | + char ch = input[i]; |
| 308 | + output[i] = (byte)ch; // Cast convert |
| 309 | + } |
258 | 310 | }
|
259 | 311 | }
|
260 | 312 | }
|
|
0 commit comments