Skip to content

Commit 9e0c8da

Browse files
authored
Multi-lined envelope reading corruption caused by direct reference into bufio.Reader's internal buffer rollover. (#214)
BUG: #213 If we're dealing with multi-lined envelope (either by rows or by header/footer), readLine() will be called several times, thus whatever ios.ByteReadLine, which uses bufio.Reader underneath, returns in a previous call may be potentially be invalidated due to bufio.Reader's internal buf rollover. If we read the previous line directly, it would cause corruption. To fix the problem the easiest solution would be simply copying the return []byte from ios.ByteReadLine every single time. But for files with single-line envelope, which are the vast majority cases, this copy becomes unnecessary and burdensome on gc. So the trick is to has a flag on reader.linesBuf's last element to tell if it contains a reference into the bufio.Reader's internal buffer, or it's a copy. Every time before we call bufio.Reader read, we check reader.liensBuf's last element flag, if it is not a copy, then we will turn it into a copy. This way, we optimize for the vast majority cases without needing allocations, and avoid any potential corruptions in the multi-lined envelope cases.
1 parent 37370fa commit 9e0c8da

File tree

3 files changed

+68
-8
lines changed

3 files changed

+68
-8
lines changed

extensions/omniv21/fileformat/flatfile/fixedlength/reader.go

Lines changed: 33 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,9 @@ import (
1414
)
1515

1616
type line struct {
17-
lineNum int // 1-based
18-
b []byte
17+
lineNum int // 1-based
18+
b []byte // either a copy of a line content or a direct ref into bufio.Reader.
19+
copied bool // see notes in reader.readLine()
1920
}
2021

2122
type reader struct {
@@ -148,9 +149,37 @@ func (r *reader) readAndMatchHeaderFooterBasedEnvelope(
148149
}
149150

150151
func (r *reader) readLine() error {
152+
// https://github.com/jf-tech/omniparser/issues/213
153+
//
154+
// If we're dealing with multi-lined envelope (either by rows or by header/footer), this
155+
// readLine() will be called several times, thus whatever ios.ByteReadLine, which uses
156+
// bufio.Reader underneath, returns in a previous call may be potentially be invalidated due to
157+
// bufio.Reader's internal buf rollover. If we read the previous line directly, it would cause
158+
// corruption.
159+
//
160+
// To fix the problem the easiest solution would be simply copying the return []byte from
161+
// ios.ByteReadLine every single time. But for files with single-line envelope, which are the
162+
// vast majority cases, this copy becomes unnecessary and burdensome on gc. So the trick is to
163+
// has a flag on reader.linesBuf's last element to tell if it contains a reference into the
164+
// bufio.Reader's internal buffer, or it's a copy. Every time before we call bufio.Reader read,
165+
// we check reader.liensBuf's last element flag, if it is not a copy, then we will turn it into
166+
// a copy.
167+
//
168+
// This way, we optimize for the vast majority cases without
169+
// needing allocations, and avoid any potential corruptions in the multi-lined envelope cases.
170+
linesBufLen := len(r.linesBuf)
171+
if linesBufLen > 0 && !r.linesBuf[linesBufLen-1].copied {
172+
cp := make([]byte, len(r.linesBuf[linesBufLen-1].b))
173+
copy(cp, r.linesBuf[linesBufLen-1].b)
174+
r.linesBuf[linesBufLen-1].b = cp
175+
r.linesBuf[linesBufLen-1].copied = true
176+
}
151177
for {
152-
// note1: ios.ByteReadLine returns a ln with trailing '\n' (and/or '\r') dropped.
153-
// note2: ios.ByteReadLine won't return io.EOF if ln returned isn't empty.
178+
// note1: ios.ByteReadLine returns a line with trailing '\n' (and/or '\r') dropped.
179+
// note2: ios.ByteReadLine won't return io.EOF if line returned isn't empty.
180+
// note3: ios.ByteReadLine's returned []byte is merely pointing into the bufio.Reader's
181+
// internal buffer, thus the content will be invalided if ios.ByteReadLine is called
182+
// again. Caution!
154183
b, err := ios.ByteReadLine(r.r)
155184
switch {
156185
case err == io.EOF:

extensions/omniv21/fileformat/flatfile/fixedlength/reader_test.go

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -412,6 +412,37 @@ func TestReadLine(t *testing.T) {
412412
assert.Equal(t, line{lineNum: 45, b: []byte("a new line")}, r.linesBuf[2])
413413
}
414414

415+
func TestReadLineMultiCalls(t *testing.T) {
416+
// See more details about the bug: https://github.com/jf-tech/omniparser/issues/213
417+
r := &reader{inputName: "test-input"}
418+
// construct a two-line input, and total length of the two line input is longer than
419+
// the bufio.Reader's internal buffer. Note bufio.Reader has a minReadBufferSize at 16.
420+
r.r = bufio.NewReaderSize(strings.NewReader("0123456789\nabcdefghijklmnopq\n!@#"), 16)
421+
// First read should be completely normal and fine.
422+
err := r.readLine()
423+
assert.NoError(t, err)
424+
assert.Equal(t, 1, r.linesRead)
425+
assert.Equal(t, 1, len(r.linesBuf))
426+
assert.Equal(t, line{lineNum: 1, b: []byte("0123456789"), copied: false}, r.linesBuf[0])
427+
// Now the second read, if we didn't do any copying of the first line result, the first line
428+
// would have become corrupted since the bufio.Reader's tiny internal buffer gets overwritten
429+
// by the new line data. With the fix, both lines in the linesBuf should be without corruption.
430+
err = r.readLine()
431+
assert.NoError(t, err)
432+
assert.Equal(t, 2, r.linesRead)
433+
assert.Equal(t, 2, len(r.linesBuf))
434+
assert.Equal(t, line{lineNum: 1, b: []byte("0123456789"), copied: true}, r.linesBuf[0])
435+
assert.Equal(t, line{lineNum: 2, b: []byte("abcdefghijklmnopq"), copied: false}, r.linesBuf[1])
436+
// 3rd line reading
437+
err = r.readLine()
438+
assert.NoError(t, err)
439+
assert.Equal(t, 3, r.linesRead)
440+
assert.Equal(t, 3, len(r.linesBuf))
441+
assert.Equal(t, line{lineNum: 1, b: []byte("0123456789"), copied: true}, r.linesBuf[0])
442+
assert.Equal(t, line{lineNum: 2, b: []byte("abcdefghijklmnopq"), copied: true}, r.linesBuf[1])
443+
assert.Equal(t, line{lineNum: 3, b: []byte("!@#"), copied: false}, r.linesBuf[2])
444+
}
445+
415446
func TestLinesToNode(t *testing.T) {
416447
for _, test := range []struct {
417448
name string

extensions/omniv21/samples/fixedlength2/fixedlength_test.go

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -108,22 +108,22 @@ func Test4_Nested(t *testing.T) {
108108
tests[test4_Nested].doTest(t)
109109
}
110110

111-
// Benchmark1_Single_Row-8 25898 45728 ns/op 28169 B/op 647 allocs/op
111+
// Benchmark1_Single_Row-8 25951 45776 ns/op 28213 B/op 645 allocs/op
112112
func Benchmark1_Single_Row(b *testing.B) {
113113
tests[test1_Single_Row].doBenchmark(b)
114114
}
115115

116-
// Benchmark2_Multi_Rows-8 15338 78273 ns/op 30179 B/op 618 allocs/op
116+
// Benchmark2_Multi_Rows-8 18583 64240 ns/op 34056 B/op 636 allocs/op
117117
func Benchmark2_Multi_Rows(b *testing.B) {
118118
tests[test2_Multi_Rows].doBenchmark(b)
119119
}
120120

121-
// Benchmark3_Header_Footer-8 6390 178301 ns/op 76571 B/op 1501 allocs/op
121+
// Benchmark3_Header_Footer-8 7306 158797 ns/op 78666 B/op 1587 allocs/op
122122
func Benchmark3_Header_Footer(b *testing.B) {
123123
tests[test3_Header_Footer].doBenchmark(b)
124124
}
125125

126-
// Benchmark4_Nested-8 10000 107948 ns/op 78986 B/op 1535 allocs/op
126+
// Benchmark4_Nested-8 10000 107955 ns/op 80929 B/op 1515 allocs/op
127127
func Benchmark4_Nested(b *testing.B) {
128128
tests[test4_Nested].doBenchmark(b)
129129
}

0 commit comments

Comments
 (0)