Skip to content

Commit 5583d91

Browse files
committed
Use ParallelBitExtract for EncodeAsciiCharsToBytes
1 parent 44e4493 commit 5583d91

File tree

1 file changed

+118
-66
lines changed

1 file changed

+118
-66
lines changed

src/Shared/ServerInfrastructure/BufferExtensions.cs

Lines changed: 118 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,11 @@
33

44
using System;
55
using System.Buffers;
6+
using System.Diagnostics;
67
using System.IO.Pipelines;
78
using System.Runtime.CompilerServices;
89
using System.Runtime.InteropServices;
10+
using System.Runtime.Intrinsics.X86;
911

1012
namespace System.Buffers
1113
{
@@ -174,87 +176,137 @@ private unsafe static void WriteAsciiMultiWrite(ref this BufferWriter<PipeWriter
174176
/// <summary>
/// Narrows <paramref name="length"/> chars read from <paramref name="input"/> into
/// <paramref name="length"/> bytes written to <paramref name="output"/>.
/// </summary>
/// <param name="input">Pointer to the first char to convert.</param>
/// <param name="output">Pointer to the first destination byte.</param>
/// <param name="length">Number of chars to convert; values of zero or less write nothing.</param>
/// <remarks>
/// The input is expected to be ASCII. Nothing here verifies that, and the packed code paths do not
/// all treat a non-zero high byte the same way (bit extraction drops it, the shift/OR fallback
/// merges it into a neighbouring byte). The multi-char loads and stores assume little-endian layout.
/// </remarks>
private static unsafe void EncodeAsciiCharsToBytes(char* input, byte* output, int length)
{
    // Note: Not BIGENDIAN-aware and no check for non-ASCII input.
    if (Bmi2.IsSupported)
    {
        if (length < 4)
        {
            // Fewer than 4 chars: convert them one by one.
            for (int i = 0; i < length; i++)
            {
                output[i] = (byte)input[i]; // Cast convert
            }
        }
        else if (Bmi2.X64.IsSupported) // 64-bit, 4+ chars
        {
            // Convert every group of 4 chars except the final group of 1 - 4 chars.
            int firstLength = length - sizeof(int);
            Debug.Assert(firstLength >= 0);
            for (int i = 0; i < firstLength; i += sizeof(int))
            {
                // The mask selects the low byte of each of the 4 chars held in the 64-bit load.
                *(uint*)(output + i) = (uint)Bmi2.X64.ParallelBitExtract(
                    *(ulong*)(input + i),
                    0x00FF00FF_00FF00FFul);
            }

            // Convert the last 4 chars, addressed from the end of the buffer.
            // When length is not a multiple of 4 this overlaps the loop's last store and
            // rewrites the same byte values, so no scalar tail loop is needed.
            *(uint*)(output + firstLength) = (uint)Bmi2.X64.ParallelBitExtract(
                *(ulong*)(input + firstLength),
                0x00FF00FF_00FF00FFul);
        }
        else // 32-bit, 4+ chars
        {
            // Convert every pair of chars except the final group of 1 - 2 chars.
            int firstLength = length - sizeof(ushort);
            Debug.Assert(firstLength >= 0);
            for (int i = 0; i < firstLength; i += sizeof(ushort))
            {
                // The mask selects the low byte of each of the 2 chars held in the 32-bit load.
                *(ushort*)(output + i) = (ushort)Bmi2.ParallelBitExtract(
                    *(uint*)(input + i),
                    0x00FF00FFu);
            }

            // Convert the last 2 chars, addressed from the end of the buffer.
            // When length is odd this overlaps the loop's last store with identical values.
            *(ushort*)(output + firstLength) = (ushort)Bmi2.ParallelBitExtract(
                *(uint*)(input + firstLength),
                0x00FF00FFu);
        }
    }
    else
    {
        // Software fallback: multiply-and-shift (64-bit) or shift-and-OR (32-bit) packing.
        const int Shift16Shift24 = (1 << 16) | (1 << 24);
        const int Shift8Identity = (1 << 8) | (1);

        int i = 0;
        // Pick the word size of the packing loop from the process pointer size.
        if (IntPtr.Size == 8) // 64 bit
        {
            if (length >= 4)
            {
                // Leading chars converted one at a time before the packed loop; at most 3,
                // so this never runs past the buffer because length >= 4 here.
                // NOTE(review): this is the char offset past the previous 8-byte boundary, not the
                // distance to the next one, so the packed loads below may still be unaligned.
                // Results do not depend on it - confirm whether true alignment was intended.
                int unaligned = (int)(((ulong)input) & 0x7) >> 1;
                for (; i < unaligned; i++)
                {
                    output[i] = (byte)input[i]; // Cast convert
                }

                // Packed loop: two 64-bit loads per iteration.
                int ulongDoubleCount = (length - i) & ~0x7;
                for (; i < ulongDoubleCount; i += 8)
                {
                    ulong inputUlong0 = *(ulong*)(input + i);
                    ulong inputUlong1 = *(ulong*)(input + i + 4);
                    // Pack 8 ASCII chars into 8 bytes
                    *(uint*)(output + i) =
                        ((uint)((inputUlong0 * Shift16Shift24) >> 24) & 0xffff) |
                        ((uint)((inputUlong0 * Shift8Identity) >> 24) & 0xffff0000);
                    *(uint*)(output + i + 4) =
                        ((uint)((inputUlong1 * Shift16Shift24) >> 24) & 0xffff) |
                        ((uint)((inputUlong1 * Shift8Identity) >> 24) & 0xffff0000);
                }

                if (length - 4 > i)
                {
                    ulong inputUlong = *(ulong*)(input + i);
                    // Pack 4 ASCII chars into 4 bytes
                    *(uint*)(output + i) =
                        ((uint)((inputUlong * Shift16Shift24) >> 24) & 0xffff) |
                        ((uint)((inputUlong * Shift8Identity) >> 24) & 0xffff0000);
                    i += 4;
                }
            }

            // Remaining chars (or the whole input when length < 4), one at a time.
            for (; i < length; i++)
            {
                output[i] = (byte)input[i]; // Cast convert
            }
        }
        else // 32 bit
        {
            // Convert one leading char when input is not 4-byte aligned.
            // The length check keeps an empty request from touching either buffer.
            if (length > 0 && (unchecked((int)input) & 0x2) != 0)
            {
                char ch = *input;
                i = 1;
                output[0] = (byte)ch; // Cast convert
            }

            // Packed loop: two 32-bit loads per iteration.
            int uintCount = (length - i) & ~0x3;
            for (; i < uintCount; i += 4)
            {
                uint inputUint0 = *(uint*)(input + i);
                uint inputUint1 = *(uint*)(input + i + 2);
                // Pack 4 ASCII chars into 4 bytes
                *(ushort*)(output + i) = (ushort)(inputUint0 | (inputUint0 >> 8));
                *(ushort*)(output + i + 2) = (ushort)(inputUint1 | (inputUint1 >> 8));
            }

            if (length - 1 > i)
            {
                uint inputUint = *(uint*)(input + i);
                // Pack 2 ASCII chars into 2 bytes
                *(ushort*)(output + i) = (ushort)(inputUint | (inputUint >> 8));
                i += 2;
            }

            // At most one char is left over at this point.
            if (i < length)
            {
                output[i] = (byte)input[i]; // Cast convert
            }
        }
    }
}

0 commit comments

Comments
 (0)