Skip to content

Commit 4b2077a

Browse files
committed
GODRIVER-1947 Unmarshal unicode surrogate pairs correctly in UnmarshalExtJSON. (#649)
* GODRIVER-1947 Unmarshal unicode surrogate pairs correctly in UnmarshalExtJSON. * Correct err handling in jsonScanner.ScanString and remove unused field in UnmarshalExtJSON test. * Explicitly write unicode.ReplacementChar for invalid surrogate and simplify test types. * Add tests for high surrogate followed by non-Unicode escape sequence and 4-byte UTF-8 extJSON marshaling. * Explicitly write the Unicode replacement character for an un-paired Unicode surrogate value.
1 parent a69cc5e commit 4b2077a

File tree

4 files changed

+204
-22
lines changed

4 files changed

+204
-22
lines changed

bson/bsonrw/json_scanner.go

Lines changed: 95 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,8 @@ import (
1313
"io"
1414
"math"
1515
"strconv"
16-
"strings"
1716
"unicode"
17+
"unicode/utf16"
1818
)
1919

2020
type jsonTokenType byte
@@ -162,6 +162,31 @@ func isValueTerminator(c byte) bool {
162162
return c == ',' || c == '}' || c == ']' || isWhiteSpace(c)
163163
}
164164

165+
// getu4 decodes the 4-byte hex sequence from the beginning of s, returning the hex value as a rune,
166+
// or it returns -1. Note that the "\u" from the unicode escape sequence should not be present.
167+
// It is copied and lightly modified from the Go JSON decode function at
168+
// https://github.com/golang/go/blob/1b0a0316802b8048d69da49dc23c5a5ab08e8ae8/src/encoding/json/decode.go#L1169-L1188
169+
func getu4(s []byte) rune {
170+
if len(s) < 4 {
171+
return -1
172+
}
173+
var r rune
174+
for _, c := range s[:4] {
175+
switch {
176+
case '0' <= c && c <= '9':
177+
c = c - '0'
178+
case 'a' <= c && c <= 'f':
179+
c = c - 'a' + 10
180+
case 'A' <= c && c <= 'F':
181+
c = c - 'A' + 10
182+
default:
183+
return -1
184+
}
185+
r = r*16 + rune(c)
186+
}
187+
return r
188+
}
189+
165190
// scanString reads from an opening '"' to a closing '"' and handles escaped characters
166191
func (js *jsonScanner) scanString() (*jsonToken, error) {
167192
var b bytes.Buffer
@@ -179,9 +204,18 @@ func (js *jsonScanner) scanString() (*jsonToken, error) {
179204
return nil, err
180205
}
181206

207+
evalNextChar:
182208
switch c {
183209
case '\\':
184210
c, err = js.readNextByte()
211+
if err != nil {
212+
if err == io.EOF {
213+
return nil, errors.New("end of input in JSON string")
214+
}
215+
return nil, err
216+
}
217+
218+
evalNextEscapeChar:
185219
switch c {
186220
case '"', '\\', '/':
187221
b.WriteByte(c)
@@ -202,13 +236,68 @@ func (js *jsonScanner) scanString() (*jsonToken, error) {
202236
return nil, fmt.Errorf("invalid unicode sequence in JSON string: %s", us)
203237
}
204238

205-
s := fmt.Sprintf(`\u%s`, us)
206-
s, err = strconv.Unquote(strings.Replace(strconv.Quote(s), `\\u`, `\u`, 1))
207-
if err != nil {
208-
return nil, err
239+
rn := getu4(us)
240+
241+
// If the rune we just decoded is the high or low value of a possible surrogate pair,
242+
// try to decode the next sequence as the low value of a surrogate pair. We're
243+
// expecting the next sequence to be another Unicode escape sequence (e.g. "\uDD1E"),
244+
// but need to handle cases where the input is not a valid surrogate pair.
245+
// For more context on unicode surrogate pairs, see:
246+
// https://www.christianfscott.com/rust-chars-vs-go-runes/
247+
// https://www.unicode.org/glossary/#high_surrogate_code_point
248+
if utf16.IsSurrogate(rn) {
249+
c, err = js.readNextByte()
250+
if err != nil {
251+
if err == io.EOF {
252+
return nil, errors.New("end of input in JSON string")
253+
}
254+
return nil, err
255+
}
256+
257+
// If the next value isn't the beginning of a backslash escape sequence, write
258+
// the Unicode replacement character for the surrogate value and goto the
259+
// beginning of the next char eval block.
260+
if c != '\\' {
261+
b.WriteRune(unicode.ReplacementChar)
262+
goto evalNextChar
263+
}
264+
265+
c, err = js.readNextByte()
266+
if err != nil {
267+
if err == io.EOF {
268+
return nil, errors.New("end of input in JSON string")
269+
}
270+
return nil, err
271+
}
272+
273+
// If the next value isn't the beginning of a unicode escape sequence, write the
274+
// Unicode replacement character for the surrogate value and goto the beginning
275+
// of the next escape char eval block.
276+
if c != 'u' {
277+
b.WriteRune(unicode.ReplacementChar)
278+
goto evalNextEscapeChar
279+
}
280+
281+
err = js.readNNextBytes(us, 4, 0)
282+
if err != nil {
283+
return nil, fmt.Errorf("invalid unicode sequence in JSON string: %s", us)
284+
}
285+
286+
rn2 := getu4(us)
287+
288+
// Try to decode the pair of runes as a utf16 surrogate pair. If that fails, write
289+
// the Unicode replacement character for the surrogate value and the 2nd decoded rune.
290+
if rnPair := utf16.DecodeRune(rn, rn2); rnPair != unicode.ReplacementChar {
291+
b.WriteRune(rnPair)
292+
} else {
293+
b.WriteRune(unicode.ReplacementChar)
294+
b.WriteRune(rn2)
295+
}
296+
297+
break
209298
}
210299

211-
b.WriteString(s)
300+
b.WriteRune(rn)
212301
default:
213302
return nil, fmt.Errorf("invalid escape sequence in JSON string '\\%c'", c)
214303
}

bson/bsonrw/json_scanner_test.go

Lines changed: 29 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,31 @@ func TestJsonScannerValidInputs(t *testing.T) {
100100
input: `"\"\\\/\b\f\n\r\t"`,
101101
tokens: []jsonToken{{t: jttString, v: "\"\\/\b\f\n\r\t"}},
102102
},
103+
{
104+
desc: "valid string--surrogate pair",
105+
input: `"abc \uD834\uDd1e 123"`,
106+
tokens: []jsonToken{{t: jttString, v: "abc 𝄞 123"}},
107+
},
108+
{
109+
desc: "valid string--high surrogate at end of string",
110+
input: `"abc \uD834"`,
111+
tokens: []jsonToken{{t: jttString, v: "abc �"}},
112+
},
113+
{
114+
desc: "valid string--low surrogate at end of string",
115+
input: `"abc \uDD1E"`,
116+
tokens: []jsonToken{{t: jttString, v: "abc �"}},
117+
},
118+
{
119+
desc: "valid string--high surrogate with non-surrogate Unicode value",
120+
input: `"abc \uDD1E\u00BF"`,
121+
tokens: []jsonToken{{t: jttString, v: "abc �¿"}},
122+
},
123+
{
124+
desc: "valid string--high surrogate with non-Unicode escape sequence",
125+
input: `"abc \uDD1E\t"`,
126+
tokens: []jsonToken{{t: jttString, v: "abc �\t"}},
127+
},
103128
{
104129
desc: "valid literal--true", input: "true",
105130
tokens: []jsonToken{{t: jttBool, v: true}},
@@ -280,28 +305,28 @@ func TestJsonScannerValidInputs(t *testing.T) {
280305

281306
for _, token := range tc.tokens {
282307
c, err := js.nextToken()
308+
expectNoError(t, err, tc.desc)
283309
jttDiff(t, token.t, c.t, tc.desc)
284310
jtvDiff(t, token.v, c.v, tc.desc)
285-
expectNoError(t, err, tc.desc)
286311
}
287312

288313
c, err := js.nextToken()
289-
jttDiff(t, jttEOF, c.t, tc.desc)
290314
noerr(t, err)
315+
jttDiff(t, jttEOF, c.t, tc.desc)
291316

292317
// testing early EOF reading
293318
js = &jsonScanner{r: iotest.DataErrReader(strings.NewReader(tc.input))}
294319

295320
for _, token := range tc.tokens {
296321
c, err := js.nextToken()
322+
expectNoError(t, err, tc.desc)
297323
jttDiff(t, token.t, c.t, tc.desc)
298324
jtvDiff(t, token.v, c.v, tc.desc)
299-
expectNoError(t, err, tc.desc)
300325
}
301326

302327
c, err = js.nextToken()
303-
jttDiff(t, jttEOF, c.t, tc.desc)
304328
noerr(t, err)
329+
jttDiff(t, jttEOF, c.t, tc.desc)
305330
})
306331
}
307332
}

bson/unmarshal_test.go

Lines changed: 75 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -95,18 +95,81 @@ func TestUnmarshalExtJSONWithRegistry(t *testing.T) {
9595
}
9696

9797
func TestUnmarshalExtJSONWithContext(t *testing.T) {
98-
t.Run("UnmarshalExtJSONWithContext", func(t *testing.T) {
99-
type teststruct struct{ Foo int }
100-
var got teststruct
101-
data := []byte("{\"foo\":1}")
102-
dc := bsoncodec.DecodeContext{Registry: DefaultRegistry}
103-
err := UnmarshalExtJSONWithContext(dc, data, true, &got)
104-
noerr(t, err)
105-
want := teststruct{1}
106-
if !cmp.Equal(got, want) {
107-
t.Errorf("Did not unmarshal as expected. got %v; want %v", got, want)
108-
}
109-
})
98+
type fooInt struct {
99+
Foo int
100+
}
101+
102+
type fooString struct {
103+
Foo string
104+
}
105+
106+
var cases = []struct {
107+
name string
108+
sType reflect.Type
109+
want interface{}
110+
data []byte
111+
}{
112+
{
113+
name: "Small struct",
114+
sType: reflect.TypeOf(fooInt{}),
115+
data: []byte(`{"foo":1}`),
116+
want: &fooInt{Foo: 1},
117+
},
118+
{
119+
name: "Valid surrogate pair",
120+
sType: reflect.TypeOf(fooString{}),
121+
data: []byte(`{"foo":"\uD834\uDd1e"}`),
122+
want: &fooString{Foo: "𝄞"},
123+
},
124+
{
125+
name: "Valid surrogate pair with other values",
126+
sType: reflect.TypeOf(fooString{}),
127+
data: []byte(`{"foo":"abc \uD834\uDd1e 123"}`),
128+
want: &fooString{Foo: "abc 𝄞 123"},
129+
},
130+
{
131+
name: "High surrogate value with no following low surrogate value",
132+
sType: reflect.TypeOf(fooString{}),
133+
data: []byte(`{"foo":"abc \uD834 123"}`),
134+
want: &fooString{Foo: "abc � 123"},
135+
},
136+
{
137+
name: "High surrogate value at end of string",
138+
sType: reflect.TypeOf(fooString{}),
139+
data: []byte(`{"foo":"\uD834"}`),
140+
want: &fooString{Foo: "�"},
141+
},
142+
{
143+
name: "Low surrogate value with no preceeding high surrogate value",
144+
sType: reflect.TypeOf(fooString{}),
145+
data: []byte(`{"foo":"abc \uDd1e 123"}`),
146+
want: &fooString{Foo: "abc � 123"},
147+
},
148+
{
149+
name: "Low surrogate value at end of string",
150+
sType: reflect.TypeOf(fooString{}),
151+
data: []byte(`{"foo":"\uDd1e"}`),
152+
want: &fooString{Foo: "�"},
153+
},
154+
{
155+
name: "High surrogate value with non-surrogate unicode value",
156+
sType: reflect.TypeOf(fooString{}),
157+
data: []byte(`{"foo":"\uD834\u00BF"}`),
158+
want: &fooString{Foo: "�¿"},
159+
},
160+
}
161+
162+
for _, tc := range cases {
163+
t.Run(tc.name, func(t *testing.T) {
164+
got := reflect.New(tc.sType).Interface()
165+
dc := bsoncodec.DecodeContext{Registry: DefaultRegistry}
166+
err := UnmarshalExtJSONWithContext(dc, tc.data, true, got)
167+
noerr(t, err)
168+
if !cmp.Equal(got, tc.want) {
169+
t.Errorf("Did not unmarshal as expected. got %+v; want %+v", got, tc.want)
170+
}
171+
})
172+
}
110173
}
111174

112175
func TestCachingDecodersNotSharedAcrossRegistries(t *testing.T) {

data/bson-corpus/string.json

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,11 @@
2828
"canonical_bson": "190000000261000D000000E29886E29886E29886E298860000",
2929
"canonical_extjson": "{\"a\" : \"\\u2606\\u2606\\u2606\\u2606\"}"
3030
},
31+
{
32+
"description": "four-byte UTF-8 (𝄞)",
33+
"canonical_bson": "1100000002610005000000F09D849E0000",
34+
"canonical_extjson": "{\"a\" : \"𝄞\"}"
35+
},
3136
{
3237
"description": "Embedded nulls",
3338
"canonical_bson": "190000000261000D0000006162006261620062616261620000",

0 commit comments

Comments
 (0)