Skip to content

Commit 98e9285

Browse files
committed
#13 - Detailed UTF-8 validation.
1 parent ae77def commit 98e9285

File tree

2 files changed

+68
-30
lines changed

2 files changed

+68
-30
lines changed

src/Microsoft.AspNet.WebSockets.Protocol/Utilities.cs

Lines changed: 39 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -70,49 +70,52 @@ public static WebSocketMessageType GetMessageType(int opCode)
7070
}
7171
}
7272

73-
// For now this is stateless and does not handle sequences spliced across messages.
74-
// http://etutorials.org/Programming/secure+programming/Chapter+3.+Input+Validation/3.12+Detecting+Illegal+UTF-8+Characters/
73+
// Performs a stateful validation of UTF-8 bytes.
74+
// It checks for valid formatting, overlong encodings, surrogates, and value ranges.
7575
public static bool TryValidateUtf8(ArraySegment<byte> arraySegment, bool endOfMessage, Utf8MessageState state)
7676
{
7777
for (int i = arraySegment.Offset; i < arraySegment.Offset + arraySegment.Count; )
7878
{
79+
// Have we started a character sequence yet?
7980
if (!state.SequenceInProgress)
8081
{
82+
// The first byte tells us how many bytes are in the sequence.
8183
state.SequenceInProgress = true;
8284
byte b = arraySegment.Array[i];
85+
i++;
8386
if ((b & 0x80) == 0) // 0bbbbbbb, single byte
8487
{
8588
state.AdditionalBytesExpected = 0;
89+
state.CurrentDecodeBits = b & 0x7F;
90+
state.ExpectedValueMin = 0;
8691
}
8792
else if ((b & 0xC0) == 0x80)
8893
{
89-
return false; // Misplaced 10bbbbbb byte. This cannot be the first byte.
94+
// Misplaced 10bbbbbb continuation byte. This cannot be the first byte.
95+
return false;
9096
}
9197
else if ((b & 0xE0) == 0xC0) // 110bbbbb 10bbbbbb
9298
{
9399
state.AdditionalBytesExpected = 1;
100+
state.CurrentDecodeBits = b & 0x1F;
101+
state.ExpectedValueMin = 0x80;
94102
}
95103
else if ((b & 0xF0) == 0xE0) // 1110bbbb 10bbbbbb 10bbbbbb
96104
{
97105
state.AdditionalBytesExpected = 2;
106+
state.CurrentDecodeBits = b & 0xF;
107+
state.ExpectedValueMin = 0x800;
98108
}
99109
else if ((b & 0xF8) == 0xF0) // 11110bbb 10bbbbbb 10bbbbbb 10bbbbbb
100110
{
101111
state.AdditionalBytesExpected = 3;
112+
state.CurrentDecodeBits = b & 0x7;
113+
state.ExpectedValueMin = 0x10000;
102114
}
103-
else if ((b & 0xFC) == 0xF8) // 111110bb 10bbbbbb 10bbbbbb 10bbbbbb 10bbbbbb
104-
{
105-
state.AdditionalBytesExpected = 4;
106-
}
107-
else if ((b & 0xFE) == 0xFC) // 1111110b 10bbbbbb 10bbbbbb 10bbbbbb 10bbbbbb 10bbbbbb
108-
{
109-
state.AdditionalBytesExpected = 5;
110-
}
111-
else // 11111110 && 11111111 are not valid
115+
else // 111110bb & 1111110b & 11111110 && 11111111 are not valid
112116
{
113117
return false;
114118
}
115-
i++;
116119
}
117120
while (state.AdditionalBytesExpected > 0 && i < arraySegment.Offset + arraySegment.Count)
118121
{
@@ -121,12 +124,32 @@ public static bool TryValidateUtf8(ArraySegment<byte> arraySegment, bool endOfMe
121124
{
122125
return false;
123126
}
124-
state.AdditionalBytesExpected--;
127+
125128
i++;
129+
state.AdditionalBytesExpected--;
130+
131+
// Each continuation byte has the binary form 10bbbbbb and carries 6 bits of data.
132+
state.CurrentDecodeBits = (state.CurrentDecodeBits << 6) | b & 0x3F;
133+
134+
if (state.AdditionalBytesExpected == 1 && state.CurrentDecodeBits >= 0x360 && state.CurrentDecodeBits <= 0x37F)
135+
{
136+
// This is going to end up in the range 0xD800-0xDFFF, the UTF-16 surrogates, which are not allowed in UTF-8.
137+
return false;
138+
}
139+
if (state.AdditionalBytesExpected == 2 && state.CurrentDecodeBits >= 0x110)
140+
{
141+
// This is going to exceed the upper Unicode bound of 0x10FFFF.
142+
return false;
143+
}
126144
}
127145
if (state.AdditionalBytesExpected == 0)
128146
{
129147
state.SequenceInProgress = false;
148+
if (state.CurrentDecodeBits < state.ExpectedValueMin)
149+
{
150+
// Overlong encoding (e.g. using 2 bytes to encode something that only needed 1).
151+
return false;
152+
}
130153
}
131154
}
132155
if (endOfMessage && state.SequenceInProgress)
@@ -140,6 +163,8 @@ public class Utf8MessageState
140163
{
141164
public bool SequenceInProgress { get; set; }
142165
public int AdditionalBytesExpected { get; set; }
166+
public int ExpectedValueMin { get; set; }
167+
public int CurrentDecodeBits { get; set; }
143168
}
144169
}
145170
}

test/Microsoft.AspNet.WebSockets.Protocol.Test/Utf8ValidationTests.cs

Lines changed: 29 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@ public class Utf8ValidationTests
1313
[InlineData(new byte[] { })]
1414
[InlineData(new byte[] { 0x48, 0x65, 0x6C, 0x6C, 0x6F, 0x20, 0x57, 0x6F, 0x72, 0x6C, 0x64 })] // Hello World
1515
[InlineData(new byte[] { 0x48, 0x65, 0x6C, 0x6C, 0x6F, 0x2D, 0xC2, 0xB5, 0x40, 0xC3, 0x9F, 0xC3, 0xB6, 0xC3, 0xA4, 0xC3, 0xBC, 0xC3, 0xA0, 0xC3, 0xA1 })] // "Hello-µ@ßöäüàá";
16+
// [InlineData(new byte[] { 0x68, 0x65, 0x6c, 0x6c, 0x6f, 0xf0, 0xa4, 0xad, 0xa2, 0x77, 0x6f, 0x72, 0x6c, 0x64 })] // "hello\U00024b62world"
17+
[InlineData(new byte[] { 0xf0, 0xa4, 0xad, 0xa2 })] // "\U00024b62"
1618
public void ValidateSingleValidSegments_Valid(byte[] data)
1719
{
1820
var state = new Utilities.Utf8MessageState();
@@ -35,28 +37,39 @@ public void ValidateMultipleValidSegments_Valid(byte[] data1, byte[] data2, byte
3537
[InlineData(new byte[] { 0xfe })]
3638
[InlineData(new byte[] { 0xff })]
3739
[InlineData(new byte[] { 0xfe, 0xfe, 0xff, 0xff })]
38-
// [InlineData(new byte[] { 0xc0, 0xaf })]
39-
// [InlineData(new byte[] { 0xe0, 0x80, 0xaf })]
40-
// [InlineData(new byte[] { 0xf4, 0x90, 0x80, 0x80 })]
41-
// [InlineData(new byte[] { 0xf0, 0x80, 0x80, 0xaf })]
42-
// [InlineData(new byte[] { 0xf8, 0x80, 0x80, 0x80, 0xaf })]
43-
// [InlineData(new byte[] { 0xfc, 0x80, 0x80, 0x80, 0x80, 0xaf })]
44-
// [InlineData(new byte[] { 0xc1, 0xbf })]
45-
// [InlineData(new byte[] { 0xed, 0xa0, 0x80, 0x65, 0x64, 0x69, 0x74, 0x65, 0x64 })] // 0xEDA080 decodes to 0xD800, which is a reserved high surrogate character.
40+
[InlineData(new byte[] { 0xc0, 0xb1 })] // Overlong Ascii
41+
[InlineData(new byte[] { 0xc1, 0xb1 })] // Overlong Ascii
42+
[InlineData(new byte[] { 0xe0, 0x80, 0xaf })] // Overlong
43+
[InlineData(new byte[] { 0xf0, 0x80, 0x80, 0xaf })] // Overlong
44+
[InlineData(new byte[] { 0xf8, 0x80, 0x80, 0x80, 0xaf })] // Overlong
45+
[InlineData(new byte[] { 0xfc, 0x80, 0x80, 0x80, 0x80, 0xaf })] // Overlong
46+
[InlineData(new byte[] { 0xed, 0xa0, 0x80, 0x65, 0x64, 0x69, 0x74, 0x65, 0x64 })] // 0xEDA080 decodes to 0xD800, which is a reserved high surrogate character.
4647
public void ValidateSingleInvalidSegment_Invalid(byte[] data)
4748
{
4849
var state = new Utilities.Utf8MessageState();
4950
Assert.False(Utilities.TryValidateUtf8(new ArraySegment<byte>(data), endOfMessage: true, state: state));
5051
}
51-
/*
52-
[Theory]
53-
// [InlineData(true, new byte[] { 0xce, 0xba, 0xe1, 0xbd, 0xb9, 0xcf, 0x83, 0xce, 0xbc, 0xce, 0xb5, 0xf4 }, false, new byte[] { 0x90 }, true, new byte[] { })]
54-
public void ValidateMultipleInvalidSegments_Invalid(bool valid1, byte[] data1, bool valid2, byte[] data2, bool valid3, byte[] data3)
52+
53+
[Fact]
54+
public void ValidateIndividualInvalidSegments_Invalid()
5555
{
56+
var data = new byte[] { 0xce, 0xba, 0xe1, 0xbd, 0xb9, 0xcf, 0x83, 0xce, 0xbc, 0xce, 0xb5, 0xed, 0xa0, 0x80, 0x65, 0x64, 0x69, 0x74, 0x65, 0x64 };
5657
var state = new Utilities.Utf8MessageState();
57-
Assert.True(valid1 == Utilities.TryValidateUtf8(new ArraySegment<byte>(data1), endOfMessage: false, state: state), "1st");
58-
Assert.True(valid2 == Utilities.TryValidateUtf8(new ArraySegment<byte>(data2), endOfMessage: false, state: state), "2nd");
59-
Assert.True(valid3 == Utilities.TryValidateUtf8(new ArraySegment<byte>(data3), endOfMessage: true, state: state), "3rd");
60-
}*/
58+
for (int i = 0; i < 12; i++)
59+
{
60+
Assert.True(Utilities.TryValidateUtf8(new ArraySegment<byte>(data, i, 1), endOfMessage: false, state: state), i.ToString());
61+
}
62+
Assert.False(Utilities.TryValidateUtf8(new ArraySegment<byte>(data, 12, 1), endOfMessage: false, state: state), 12.ToString());
63+
}
64+
65+
[Fact]
66+
public void ValidateMultipleInvalidSegments_Invalid()
67+
{
68+
var data0 = new byte[] { 0xce, 0xba, 0xe1, 0xbd, 0xb9, 0xcf, 0x83, 0xce, 0xbc, 0xce, 0xb5, 0xf4 };
69+
var data1 = new byte[] { 0x90 };
70+
var state = new Utilities.Utf8MessageState();
71+
Assert.True(Utilities.TryValidateUtf8(new ArraySegment<byte>(data0), endOfMessage: false, state: state));
72+
Assert.False(Utilities.TryValidateUtf8(new ArraySegment<byte>(data1), endOfMessage: false, state: state));
73+
}
6174
}
6275
}

0 commit comments

Comments
 (0)